# Load libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV
import re
import ipaddress
from IPython.display import display, HTML
from cyberpandas import IPArray, to_ipaddress
from datetime import datetime
import os
import time
import json
import csv
import plotly.express as px
import plotly.io as pio
%%HTML
<!-- Set dataframe style: header cells (th) of elements with class "dataframe" get a dark blue background, white monospace text, a 3px white border and forced left alignment -->
<style>.dataframe th{
background:#3f577c;
font-family:monospace;
color:white;
border:3px solid white;
text-align:left !important;}
</style>
# Load raw data
# Reads every file in `dir` as a CSV, normalises the column names, tags each
# row with the name of the file it came from ('fn') and stacks all files into
# the combined data frame `df2`.
t1 = time.perf_counter()
# NOTE(review): `dir` shadows the builtin dir(); the name is kept because
# other cells may reference it.
dir = 'C:/Users/micha/Downloads/flows'

# Columns renamed by hand (applied after stripping, before lower-casing)
dRenameFirst = {'Source IP': 'srcip', 'Destination IP': 'dstip', 'Protocol': 'proto'}

# Substring substitutions applied to the lower-cased names, in this order.
# Order matters: 'bytes/s' and 'packets/s' must be handled before 'packet'
# and '/', and 'packet' is replaced before 'packets' is ever tried.
dRename = {
    'source': 'src',
    'destination': 'dst',
    'protocol': 'proto',
    'bytes/s': 'bps',
    'packets/s': 'pps',
    'packet': 'pkt',
    'packets': 'pkt',
    'length': 'len',
    'header': 'hdr',
    'total': 'tot',
    'count': 'ct',
    'average': 'avg',
    'variance': 'var',
    'size': 'sz',
    'forward': 'fwd',
    'backward': 'bwd',
    'segment': 'seg',
    ' ': '_',
    '/': '_',
    '.': '_',
}

frames = []      # one wrangled data frame per input file
totRows = 0      # running row count, for the progress message
allCols = set()  # union of column names seen so far, for the progress message
ct = 0           # number of files read

# Sort the listing so the row order of df2 does not depend on the file system
for fn in sorted(os.listdir(dir)):
    # Build path; skip sub-directories, which read_csv cannot open
    fullpath = os.path.join(dir, fn)
    if not os.path.isfile(fullpath):
        continue
    ct += 1
    print("Reading", fn)
    # Columns at positions 0, 1, 3, 6 and 84 are forced to str
    # (presumably flow id, the two IPs, timestamp and label - verify against the CSV header)
    df = pd.read_csv(fullpath, encoding='latin1', low_memory=False,
                     converters={0: str, 1: str, 3: str, 6: str, 84: str})
    # Shape
    print("Shape before data wrangling:", df.shape)

    # Normalise all column names in a single pass:
    #   1. strip surrounding whitespace (most names have a leading space)
    #   2. apply the hand-written renames
    #   3. lower-case
    #   4. apply the ordered substring substitutions
    newCols = []
    for col in df.columns:
        col = col.strip()
        col = dRenameFirst.get(col, col)
        col = col.lower()
        for old, new in dRename.items():
            col = col.replace(old, new)
        newCols.append(col)
    df.columns = newCols

    # Add filename
    df['fn'] = fn

    # Collect the frame; everything is concatenated once after the loop so the
    # accumulated data is not copied again for every file
    frames.append(df)
    totRows += df.shape[0]
    allCols.update(df.columns)
    print("Shape after concatenating:", (totRows, len(allCols)))
    # Drop the loop-local reference (the frame itself lives on in `frames`)
    del df

if not frames:
    raise FileNotFoundError("No files to read in " + repr(dir))

# Each file keeps its own 0..n-1 row labels, so index labels repeat across files
df2 = pd.concat(frames)
# Release the per-file frames now that df2 holds a copy of the data
del frames
print()
print("Done reading data")
Reading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv Shape before data wrangling: (225745, 85) Shape after concatenating: (225745, 86) Reading Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv Shape before data wrangling: (286467, 85) Shape after concatenating: (512212, 86) Reading Friday-WorkingHours-Morning.pcap_ISCX.csv Shape before data wrangling: (191033, 85) Shape after concatenating: (703245, 86) Reading Monday-WorkingHours.pcap_ISCX.csv Shape before data wrangling: (529918, 85) Shape after concatenating: (1233163, 86) Reading Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv Shape before data wrangling: (288602, 85) Shape after concatenating: (1521765, 86) Reading Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv Shape before data wrangling: (170366, 85) Shape after concatenating: (1692131, 86) Reading Tuesday-WorkingHours.pcap_ISCX.csv Shape before data wrangling: (445909, 85) Shape after concatenating: (2138040, 86) Reading Wednesday-workingHours.pcap_ISCX.csv Shape before data wrangling: (692703, 85) Shape after concatenating: (2830743, 86) Done reading data
# Additional data wrangling on combined data frame

# Convert proto column to categorical: replace the protocol numbers 1, 6 and
# 17 by their names; any other value is left as it is
for protoNum, protoName in ((1, 'icmp'), (6, 'tcp'), (17, 'udp')):
    df2.loc[df2['proto'] == protoNum, 'proto'] = protoName

# Convert timestamp field from this format: "7/7/2017 8:59"
# to ISO 8601: "2017-07-07 08:59:00"
# The raw values are day-first ("%d/%m/%Y %H:%M"). Without dayfirst=True
# pandas reads ambiguous dates month-first, e.g. "5/7/2017" in the Wednesday
# file became 2017-05-07 instead of 2017-07-05.
# NOTE(review): the raw times appear to carry no AM/PM marker (afternoon
# captures show hours such as 03:30) - confirm before relying on time of day.
df2['timestamp'] = pd.to_datetime(df2['timestamp'], dayfirst=True)

# Replace '\x96' (en dash) in labels
df2['label'] = df2['label'].str.replace('\x96', '-', regex=False)

# Split out day of week from the leading day name of the source file name;
# rows whose file name starts with no day name are left as NaN
for dayName, dayAbbrev in (
    ('Sunday', 'Sun'),
    ('Monday', 'Mon'),
    ('Tuesday', 'Tue'),
    ('Wednesday', 'Wed'),
    ('Thursday', 'Thu'),
    ('Friday', 'Fri'),
    ('Saturday', 'Sat'),
):
    df2.loc[df2['fn'].str.startswith(dayName), 'day'] = dayAbbrev

# Get dummies for categorical variable "day"
# (drop_first removes the alphabetically first category to avoid a redundant column)
tmpDays = pd.get_dummies(df2['day'], drop_first=True)
df2 = df2.drop(['day'], axis=1)
df2 = pd.concat([df2, tmpDays], axis=1)

# Drop records with protocol 0 or na
# (NaN != 0 evaluates to True, so missing values need their own test)
df2 = df2[df2['proto'].notna() & (df2['proto'] != 0)]

# Get dummies for categorical variable "proto"
tmpProto = pd.get_dummies(df2['proto'], drop_first=True)
df2 = df2.drop(['proto'], axis=1)
df2 = pd.concat([df2, tmpProto], axis=1)
# Show the combined frame so missing or unexpected values can be eyeballed
display(df2)

# Rows holding at least one NaN: flag them once, then reuse the flag for both
# the subset and the count
naRows = df2.isna().any(axis=1)
dfmissing = df2[naRows]
numMissing = naRows.sum()
pctMissing = round(100 * numMissing / df2.shape[0], 3)
print(f"# rows with na's: {numMissing!s} ({pctMissing!s}%)")
display(dfmissing)
| flow_id | srcip | src_port | dstip | dst_port | timestamp | flow_duration | tot_fwd_pkts | tot_bwd_pkts | tot_len_of_fwd_pkts | tot_len_of_bwd_pkts | fwd_pkt_len_max | fwd_pkt_len_min | fwd_pkt_len_mean | fwd_pkt_len_std | bwd_pkt_len_max | bwd_pkt_len_min | bwd_pkt_len_mean | bwd_pkt_len_std | flow_bps | flow_pps | flow_iat_mean | flow_iat_std | flow_iat_max | flow_iat_min | fwd_iat_tot | fwd_iat_mean | fwd_iat_std | fwd_iat_max | fwd_iat_min | bwd_iat_tot | bwd_iat_mean | bwd_iat_std | bwd_iat_max | bwd_iat_min | fwd_psh_flags | bwd_psh_flags | fwd_urg_flags | bwd_urg_flags | fwd_hdr_len | bwd_hdr_len | fwd_pps | bwd_pps | min_pkt_len | max_pkt_len | pkt_len_mean | pkt_len_std | pkt_len_var | fin_flag_ct | syn_flag_ct | rst_flag_ct | psh_flag_ct | ack_flag_ct | urg_flag_ct | cwe_flag_ct | ece_flag_ct | down_up_ratio | avg_pkt_sz | avg_fwd_seg_sz | avg_bwd_seg_sz | fwd_hdr_len_1 | fwd_avg_bytes_bulk | fwd_avg_pkts_bulk | fwd_avg_bulk_rate | bwd_avg_bytes_bulk | bwd_avg_pkts_bulk | bwd_avg_bulk_rate | subflow_fwd_pkts | subflow_fwd_bytes | subflow_bwd_pkts | subflow_bwd_bytes | init_win_bytes_fwd | init_win_bytes_bwd | act_data_pkt_fwd | min_seg_sz_fwd | active_mean | active_std | active_max | active_min | idle_mean | idle_std | idle_max | idle_min | label | fn | Mon | Thu | Tue | Wed | udp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 192.168.10.5-104.16.207.165-54865-443-6 | 104.16.207.165 | 443 | 192.168.10.5 | 54865 | 2017-07-07 03:30:00 | 3 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.0 | 6.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 4.000000e+06 | 666666.666700 | 3.0 | 0.000000 | 3.0 | 3.0 | 3.0 | 3.00000 | 0.00000 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 40 | 0 | 666666.666700 | 0.000000 | 6.0 | 6.0 | 6.000000 | 0.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 9.000000 | 6.0 | 0.0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12 | 0 | 0 | 33 | -1 | 1 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 1 | 192.168.10.5-104.16.28.216-55054-80-6 | 104.16.28.216 | 80 | 192.168.10.5 | 55054 | 2017-07-07 03:30:00 | 109 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 6.0 | 0.00000 | 6.0 | 6.0 | 6.0 | 0.0 | 1.100917e+05 | 18348.623850 | 109.0 | 0.000000 | 109.0 | 109.0 | 0.0 | 0.00000 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 20 | 20 | 9174.311927 | 9174.311927 | 6.0 | 6.0 | 6.000000 | 0.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1.0 | 9.000000 | 6.0 | 6.0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 6 | 29 | 256 | 0 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 2 | 192.168.10.5-104.16.28.216-55055-80-6 | 104.16.28.216 | 80 | 192.168.10.5 | 55055 | 2017-07-07 03:30:00 | 52 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 6.0 | 0.00000 | 6.0 | 6.0 | 6.0 | 0.0 | 2.307692e+05 | 38461.538460 | 52.0 | 0.000000 | 52.0 | 52.0 | 0.0 | 0.00000 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 20 | 20 | 19230.769230 | 19230.769230 | 6.0 | 6.0 | 6.000000 | 0.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1.0 | 9.000000 | 6.0 | 6.0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 6 | 29 | 256 | 0 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 3 | 192.168.10.16-104.17.241.25-46236-443-6 | 104.17.241.25 | 443 | 192.168.10.16 | 46236 | 2017-07-07 03:30:00 | 34 | 1 | 1 | 6.0 | 6.0 | 6.0 | 6.0 | 6.0 | 0.00000 | 6.0 | 6.0 | 6.0 | 0.0 | 3.529412e+05 | 58823.529410 | 34.0 | 0.000000 | 34.0 | 34.0 | 0.0 | 0.00000 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 20 | 20 | 29411.764710 | 29411.764710 | 6.0 | 6.0 | 6.000000 | 0.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1.0 | 9.000000 | 6.0 | 6.0 | 20 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 6 | 1 | 6 | 31 | 329 | 0 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 4 | 192.168.10.5-104.19.196.102-54863-443-6 | 104.19.196.102 | 443 | 192.168.10.5 | 54863 | 2017-07-07 03:30:00 | 3 | 2 | 0 | 12.0 | 0.0 | 6.0 | 6.0 | 6.0 | 0.00000 | 0.0 | 0.0 | 0.0 | 0.0 | 4.000000e+06 | 666666.666700 | 3.0 | 0.000000 | 3.0 | 3.0 | 3.0 | 3.00000 | 0.00000 | 3.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 40 | 0 | 666666.666700 | 0.000000 | 6.0 | 6.0 | 6.000000 | 0.000000 | 0.000000 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 9.000000 | 6.0 | 0.0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 12 | 0 | 0 | 32 | -1 | 1 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 692698 | 192.168.10.3-192.168.10.14-53-51114-17 | 192.168.10.14 | 51114 | 192.168.10.3 | 53 | 2017-05-07 12:10:00 | 32215 | 4 | 2 | 112.0 | 152.0 | 28.0 | 28.0 | 28.0 | 0.00000 | 76.0 | 76.0 | 76.0 | 0.0 | 8.194940e+03 | 186.248642 | 6443.0 | 13617.579480 | 30780.0 | 3.0 | 30832.0 | 10277.33333 | 17755.84381 | 30780.0 | 4.0 | 3.0 | 3.0 | 0.0 | 3.0 | 3.0 | 0 | 0 | 0 | 0 | 80 | 64 | 124.165761 | 62.082881 | 28.0 | 76.0 | 41.714286 | 23.421602 | 548.571429 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 48.666667 | 28.0 | 76.0 | 80 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 112 | 2 | 152 | -1 | -1 | 3 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 1 |
| 692699 | 192.168.10.3-192.168.10.16-53-24054-17 | 192.168.10.16 | 24054 | 192.168.10.3 | 53 | 2017-05-07 03:02:00 | 324 | 2 | 2 | 84.0 | 362.0 | 42.0 | 42.0 | 42.0 | 0.00000 | 181.0 | 181.0 | 181.0 | 0.0 | 1.376543e+06 | 12345.679010 | 108.0 | 183.597386 | 320.0 | 2.0 | 2.0 | 2.00000 | 0.00000 | 2.0 | 2.0 | 2.0 | 2.0 | 0.0 | 2.0 | 2.0 | 0 | 0 | 0 | 0 | 40 | 40 | 6172.839506 | 6172.839506 | 42.0 | 181.0 | 97.600000 | 76.133435 | 5796.300000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1.0 | 122.000000 | 42.0 | 181.0 | 40 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 84 | 2 | 362 | -1 | -1 | 1 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 1 |
| 692700 | 192.168.10.51-23.208.163.130-58030-443-6 | 23.208.163.130 | 443 | 192.168.10.51 | 58030 | 2017-05-07 10:06:00 | 82 | 2 | 1 | 31.0 | 6.0 | 31.0 | 0.0 | 15.5 | 21.92031 | 6.0 | 6.0 | 6.0 | 0.0 | 4.512195e+05 | 36585.365850 | 41.0 | 52.325902 | 78.0 | 4.0 | 4.0 | 4.00000 | 0.00000 | 4.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | 0 | 0 | 0 | 64 | 20 | 24390.243900 | 12195.121950 | 0.0 | 31.0 | 17.000000 | 16.350331 | 267.333333 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 22.666667 | 15.5 | 6.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 31 | 1 | 6 | 1006 | 0 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
| 692701 | 192.168.10.3-192.168.10.14-53-51694-17 | 192.168.10.14 | 51694 | 192.168.10.3 | 53 | 2017-05-07 01:19:00 | 1048635 | 6 | 2 | 192.0 | 256.0 | 32.0 | 32.0 | 32.0 | 0.00000 | 128.0 | 128.0 | 128.0 | 0.0 | 4.272221e+02 | 7.628965 | 149805.0 | 375521.040500 | 1000947.0 | 1.0 | 1033613.0 | 206722.60000 | 444210.06860 | 1000947.0 | 1.0 | 3.0 | 3.0 | 0.0 | 3.0 | 3.0 | 0 | 0 | 0 | 0 | 120 | 40 | 5.721724 | 1.907241 | 32.0 | 128.0 | 53.333333 | 42.332021 | 1792.000000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 60.000000 | 32.0 | 128.0 | 120 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | 192 | 2 | 256 | -1 | -1 | 5 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 1 |
| 692702 | 192.168.10.3-192.168.10.14-53-57949-17 | 192.168.10.14 | 57949 | 192.168.10.3 | 53 | 2017-05-07 02:43:00 | 94939 | 4 | 2 | 188.0 | 226.0 | 47.0 | 47.0 | 47.0 | 0.00000 | 113.0 | 113.0 | 113.0 | 0.0 | 4.360695e+03 | 63.198475 | 18987.8 | 31664.102560 | 73049.0 | 1.0 | 73051.0 | 24350.33333 | 42174.28246 | 73049.0 | 1.0 | 48.0 | 48.0 | 0.0 | 48.0 | 48.0 | 0 | 0 | 0 | 0 | 104 | 64 | 42.132317 | 21.066158 | 47.0 | 113.0 | 65.857143 | 32.204702 | 1037.142857 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 76.833333 | 47.0 | 113.0 | 104 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 188 | 2 | 226 | -1 | -1 | 3 | 20 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 1 |
2829047 rows × 90 columns
# rows with na's: 1358 (0.048%)
| flow_id | srcip | src_port | dstip | dst_port | timestamp | flow_duration | tot_fwd_pkts | tot_bwd_pkts | tot_len_of_fwd_pkts | tot_len_of_bwd_pkts | fwd_pkt_len_max | fwd_pkt_len_min | fwd_pkt_len_mean | fwd_pkt_len_std | bwd_pkt_len_max | bwd_pkt_len_min | bwd_pkt_len_mean | bwd_pkt_len_std | flow_bps | flow_pps | flow_iat_mean | flow_iat_std | flow_iat_max | flow_iat_min | fwd_iat_tot | fwd_iat_mean | fwd_iat_std | fwd_iat_max | fwd_iat_min | bwd_iat_tot | bwd_iat_mean | bwd_iat_std | bwd_iat_max | bwd_iat_min | fwd_psh_flags | bwd_psh_flags | fwd_urg_flags | bwd_urg_flags | fwd_hdr_len | bwd_hdr_len | fwd_pps | bwd_pps | min_pkt_len | max_pkt_len | pkt_len_mean | pkt_len_std | pkt_len_var | fin_flag_ct | syn_flag_ct | rst_flag_ct | psh_flag_ct | ack_flag_ct | urg_flag_ct | cwe_flag_ct | ece_flag_ct | down_up_ratio | avg_pkt_sz | avg_fwd_seg_sz | avg_bwd_seg_sz | fwd_hdr_len_1 | fwd_avg_bytes_bulk | fwd_avg_pkts_bulk | fwd_avg_bulk_rate | bwd_avg_bytes_bulk | bwd_avg_pkts_bulk | bwd_avg_bulk_rate | subflow_fwd_pkts | subflow_fwd_bytes | subflow_bwd_pkts | subflow_bwd_bytes | init_win_bytes_fwd | init_win_bytes_bwd | act_data_pkt_fwd | min_seg_sz_fwd | active_mean | active_std | active_max | active_min | idle_mean | idle_std | idle_max | idle_min | label | fn | Mon | Thu | Tue | Wed | udp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6796 | 192.168.10.16-198.54.12.145-36812-80-6 | 198.54.12.145 | 80 | 192.168.10.16 | 36812 | 2017-07-07 03:35:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 7633 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 14739 | 192.168.10.25-192.168.10.50-53581-37575-6 | 192.168.10.50 | 37575 | 192.168.10.25 | 53581 | 2017-07-07 03:46:00 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 44 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 44 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 408 | 65535 | 0 | 44 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 15047 | 192.168.10.17-1.1.70.73-48283-80-6 | 192.168.10.17 | 48283 | 1.1.70.73 | 80 | 2017-07-07 03:48:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 274 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 209728 | 192.168.10.17-192.168.10.50-39026-18467-6 | 192.168.10.17 | 39026 | 192.168.10.50 | 18467 | 2017-07-07 04:34:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 229 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv | 0 | 0 | 0 | 0 | 0 |
| 12824 | 192.168.10.25-192.168.10.50-52509-13370-6 | 192.168.10.25 | 52509 | 192.168.10.50 | 13370 | 2017-07-07 01:30:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 65535 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Friday-WorkingHours-Afternoon-PortScan.pcap_IS... | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 602009 | 172.217.10.34-192.168.10.25-443-50834-6 | 172.217.10.34 | 443 | 192.168.10.25 | 50834 | 2017-05-07 10:59:00 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 32 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 426 | 65535 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
| 629492 | 172.217.12.162-192.168.10.17-443-45113-6 | 172.217.12.162 | 443 | 192.168.10.17 | 45113 | 2017-05-07 01:33:00 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 32 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 413 | 535 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
| 653553 | 160.68.117.59-192.168.10.12-443-48698-6 | 160.68.117.59 | 443 | 192.168.10.12 | 48698 | 2017-05-07 10:10:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 5072 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
| 671012 | 192.168.10.50-192.168.10.51-22567-46394-6 | 192.168.10.51 | 46394 | 192.168.10.50 | 22567 | 2017-05-07 11:25:00 | 0 | 2 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 64 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 64 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 229 | -1 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
| 686806 | 185.86.139.29-192.168.10.16-443-33238-6 | 185.86.139.29 | 443 | 192.168.10.16 | 33238 | 2017-05-07 09:55:00 | 0 | 1 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | inf | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 32 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 0.0 | 32 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 7635 | 40764 | 0 | 32 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | BENIGN | Wednesday-workingHours.pcap_ISCX.csv | 0 | 0 | 0 | 1 | 0 |
1358 rows × 90 columns
# Look for infinite values (which PCA will choke on) and drop the rows holding them.
# Columns are handled in order and rows are removed as soon as they are found,
# so each reported count only covers rows that survived the earlier columns.
for pos, dtype in enumerate(df2.dtypes):
    # Only NumPy floating-point columns can contain +/-inf
    if not (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating)):
        continue
    # Plain boolean array, so filtering is positional and unaffected by the
    # repeated index labels
    infMask = np.isinf(df2.iloc[:, pos].to_numpy())
    numInf = infMask.sum()
    if numInf > 0:
        print(numInf, "infinite values in column", df2.columns[pos])
        # Remove both +inf and -inf: the count above includes both signs
        df2 = df2[~infMask]
1509 infinite values in column flow_bps 1358 infinite values in column flow_pps
# Missing data: discard every row that still contains a NaN in any column,
# reporting the frame's shape on either side of the operation
print("Shape before dropping rows with NaNs:", df2.shape)
df2 = df2.dropna(axis=0, how='any')
print("Shape after dropping rows with NaNs:", df2.shape)
Shape before dropping rows with NaNs: (2826180, 90) Shape after dropping rows with NaNs: (2826180, 90)
# Field info
# Let pandas show every column of df2 (global display option)
pd.options.display.max_columns = df2.shape[1]
print("Data frame info:")
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that emitted a stray "None" line)
df2.info()
print()

# Categories of categorical variables
print("Label categories:")
print(df2['label'].unique())
print()

# Summary stats
print("Summary stats:")
try:
    # pandas 1.x: summarise datetime columns numerically (mean, quantiles, ...)
    summaryStats = df2.describe(include='all', datetime_is_numeric=True)
except TypeError:
    # pandas >= 2.0 removed the keyword; datetimes are always summarised numerically
    summaryStats = df2.describe(include='all')
print(summaryStats)
print()
Data frame info:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2826180 entries, 0 to 692702
Data columns (total 90 columns):
# Column Dtype
--- ------ -----
0 flow_id object
1 srcip object
2 src_port int64
3 dstip object
4 dst_port int64
5 timestamp datetime64[ns]
6 flow_duration int64
7 tot_fwd_pkts int64
8 tot_bwd_pkts int64
9 tot_len_of_fwd_pkts float64
10 tot_len_of_bwd_pkts float64
11 fwd_pkt_len_max float64
12 fwd_pkt_len_min float64
13 fwd_pkt_len_mean float64
14 fwd_pkt_len_std float64
15 bwd_pkt_len_max float64
16 bwd_pkt_len_min float64
17 bwd_pkt_len_mean float64
18 bwd_pkt_len_std float64
19 flow_bps float64
20 flow_pps float64
21 flow_iat_mean float64
22 flow_iat_std float64
23 flow_iat_max float64
24 flow_iat_min float64
25 fwd_iat_tot float64
26 fwd_iat_mean float64
27 fwd_iat_std float64
28 fwd_iat_max float64
29 fwd_iat_min float64
30 bwd_iat_tot float64
31 bwd_iat_mean float64
32 bwd_iat_std float64
33 bwd_iat_max float64
34 bwd_iat_min float64
35 fwd_psh_flags int64
36 bwd_psh_flags int64
37 fwd_urg_flags int64
38 bwd_urg_flags int64
39 fwd_hdr_len int64
40 bwd_hdr_len int64
41 fwd_pps float64
42 bwd_pps float64
43 min_pkt_len float64
44 max_pkt_len float64
45 pkt_len_mean float64
46 pkt_len_std float64
47 pkt_len_var float64
48 fin_flag_ct int64
49 syn_flag_ct int64
50 rst_flag_ct int64
51 psh_flag_ct int64
52 ack_flag_ct int64
53 urg_flag_ct int64
54 cwe_flag_ct int64
55 ece_flag_ct int64
56 down_up_ratio float64
57 avg_pkt_sz float64
58 avg_fwd_seg_sz float64
59 avg_bwd_seg_sz float64
60 fwd_hdr_len_1 int64
61 fwd_avg_bytes_bulk int64
62 fwd_avg_pkts_bulk int64
63 fwd_avg_bulk_rate int64
64 bwd_avg_bytes_bulk int64
65 bwd_avg_pkts_bulk int64
66 bwd_avg_bulk_rate int64
67 subflow_fwd_pkts int64
68 subflow_fwd_bytes int64
69 subflow_bwd_pkts int64
70 subflow_bwd_bytes int64
71 init_win_bytes_fwd int64
72 init_win_bytes_bwd int64
73 act_data_pkt_fwd int64
74 min_seg_sz_fwd int64
75 active_mean float64
76 active_std float64
77 active_max float64
78 active_min float64
79 idle_mean float64
80 idle_std float64
81 idle_max float64
82 idle_min float64
83 label object
84 fn object
85 Mon uint8
86 Thu uint8
87 Tue uint8
88 Wed uint8
89 udp uint8
dtypes: datetime64[ns](1), float64(45), int64(34), object(5), uint8(5)
memory usage: 1.8+ GB
None
Label categories:
['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
'Web Attack - Brute Force' 'Web Attack - XSS'
'Web Attack - Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']
Summary stats:
flow_id srcip src_port \
count 2826180 2826180 2.826180e+06
unique 1084509 16990 NaN
top 192.168.10.255-192.168.10.3-137-137-17 172.16.0.1 NaN
freq 523 558141 NaN
mean NaN NaN 4.116262e+04
min NaN NaN 1.000000e+00
25% NaN NaN 3.283000e+04
50% NaN NaN 5.095800e+04
75% NaN NaN 5.842500e+04
max NaN NaN 6.553500e+04
std NaN NaN 2.227550e+04
dstip dst_port timestamp \
count 2826180 2.826180e+06 2826180
unique 19041 NaN NaN
top 192.168.10.3 NaN NaN
freq 685128 NaN NaN
mean NaN 8.066371e+03 2017-05-11 07:42:06.019697152
min NaN 1.000000e+00 2017-03-07 01:00:01
25% NaN 5.300000e+01 2017-04-07 04:25:00
50% NaN 8.000000e+01 2017-05-07 10:48:00
75% NaN 4.430000e+02 2017-06-07 12:44:00
max NaN 6.553500e+04 2017-07-07 12:59:00
std NaN 1.827873e+04 NaN
flow_duration tot_fwd_pkts tot_bwd_pkts tot_len_of_fwd_pkts \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 1.475845e+07 9.311210e+00 1.041004e+01 5.501821e+02
min -1.300000e+01 1.000000e+00 0.000000e+00 0.000000e+00
25% 1.550000e+02 2.000000e+00 1.000000e+00 1.200000e+01
50% 3.132900e+04 2.000000e+00 2.000000e+00 6.200000e+01
75% 3.193939e+06 5.000000e+00 4.000000e+00 1.880000e+02
max 1.200000e+08 2.197590e+05 2.919220e+05 1.290000e+07
std 3.361178e+07 7.502717e+02 9.981930e+02 1.000163e+04
tot_len_of_bwd_pkts fwd_pkt_len_max fwd_pkt_len_min \
count 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 1.618874e+04 2.079291e+02 1.874053e+01
min 0.000000e+00 0.000000e+00 0.000000e+00
25% 4.000000e+00 6.000000e+00 0.000000e+00
50% 1.240000e+02 3.700000e+01 2.000000e+00
75% 4.840000e+02 8.200000e+01 3.600000e+01
max 6.554530e+08 2.482000e+04 2.325000e+03
std 2.264914e+06 7.177155e+02 6.037169e+01
fwd_pkt_len_mean fwd_pkt_len_std bwd_pkt_len_max bwd_pkt_len_min \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 5.829124e+01 6.901951e+01 8.722535e+02 4.111387e+01
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 6.000000e+00 0.000000e+00 2.000000e+00 0.000000e+00
50% 3.400000e+01 0.000000e+00 8.000000e+01 0.000000e+00
75% 5.000000e+01 2.616295e+01 2.820000e+02 7.700000e+01
max 5.940857e+03 7.125597e+03 1.953000e+04 2.896000e+03
std 1.862237e+02 2.814005e+02 1.947624e+03 6.889449e+01
bwd_pkt_len_mean bwd_pkt_len_std flow_bps flow_pps \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 3.064413e+02 3.358671e+02 1.492614e+06 7.087142e+04
min 0.000000e+00 0.000000e+00 -2.610000e+08 -2.000000e+06
25% 2.000000e+00 0.000000e+00 1.193804e+02 3.456822e+00
50% 7.200000e+01 0.000000e+00 4.596715e+03 1.099913e+02
75% 1.813333e+02 7.860195e+01 1.666667e+05 2.325581e+04
max 5.800500e+03 8.194660e+03 2.071000e+09 4.000000e+06
std 6.056207e+02 8.402626e+02 2.594791e+07 2.544386e+05
flow_iat_mean flow_iat_std flow_iat_max flow_iat_min fwd_iat_tot \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 1.299647e+06 2.922084e+06 9.188572e+06 1.625879e+05 1.445528e+07
min -1.300000e+01 0.000000e+00 -1.300000e+01 -1.400000e+01 0.000000e+00
25% 6.400000e+01 0.000000e+00 1.240000e+02 3.000000e+00 0.000000e+00
50% 1.150593e+04 1.378151e+02 3.087100e+04 4.000000e+00 4.500000e+01
75% 3.368703e+05 6.891644e+05 2.427224e+06 6.400000e+01 1.236743e+06
max 1.200000e+08 8.480026e+07 1.200000e+08 1.200000e+08 1.200000e+08
std 4.510501e+06 8.050574e+06 2.447515e+07 2.952431e+06 3.353336e+07
fwd_iat_mean fwd_iat_std fwd_iat_max fwd_iat_min bwd_iat_tot \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 2.613465e+06 3.270332e+06 9.048814e+06 1.023449e+06 9.909463e+06
min 0.000000e+00 0.000000e+00 0.000000e+00 -1.200000e+01 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 2.900000e+01 0.000000e+00 4.400000e+01 3.000000e+00 3.000000e+00
75% 2.058253e+05 6.572025e+04 9.252650e+05 4.800000e+01 9.963325e+04
max 1.200000e+08 8.460293e+07 1.200000e+08 1.200000e+08 1.200000e+08
std 9.532443e+06 9.645259e+06 2.454487e+07 8.598004e+06 2.875685e+07
bwd_iat_mean bwd_iat_std bwd_iat_max bwd_iat_min fwd_psh_flags \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 1.808645e+06 1.488292e+06 4.692064e+06 9.688127e+05 4.642379e-02
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 3.000000e+00 0.000000e+00 3.000000e+00 1.000000e+00 0.000000e+00
75% 1.843982e+04 1.590471e+04 6.086825e+04 4.500000e+01 0.000000e+00
max 1.200000e+08 8.441801e+07 1.200000e+08 1.200000e+08 1.000000e+00
std 8.894058e+06 6.283144e+06 1.717356e+07 8.315596e+06 2.104011e-01
bwd_psh_flags fwd_urg_flags bwd_urg_flags fwd_hdr_len \
count 2826180.0 2.826180e+06 2826180.0 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 0.0 1.114579e-04 0.0 -2.603941e+04
min 0.0 0.000000e+00 0.0 -3.221223e+10
25% 0.0 0.000000e+00 0.0 4.000000e+01
50% 0.0 0.000000e+00 0.0 6.400000e+01
75% 0.0 0.000000e+00 0.0 1.200000e+02
max 0.0 1.000000e+00 0.0 4.644908e+06
std 0.0 1.055677e-02 0.0 2.106985e+07
bwd_hdr_len fwd_pps bwd_pps min_pkt_len max_pkt_len \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean -2.276954e+03 6.394313e+04 7.006481e+03 1.645875e+01 9.519301e+02
min -1.073741e+09 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 2.000000e+01 1.774760e+00 1.270092e-01 0.000000e+00 6.000000e+00
50% 4.000000e+01 6.168270e+01 1.990604e+01 2.000000e+00 8.800000e+01
75% 1.040000e+02 1.204819e+04 7.380074e+03 3.600000e+01 5.330000e+02
max 5.838440e+06 3.000000e+06 2.000000e+06 1.448000e+03 2.482000e+04
std 1.453381e+06 2.476743e+05 3.818145e+04 2.525050e+01 2.029509e+03
pkt_len_mean pkt_len_std pkt_len_var fin_flag_ct syn_flag_ct \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 1.722170e+02 2.954492e+02 4.869389e+05 3.530384e-02 4.642379e-02
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 6.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 5.733333e+01 2.629068e+01 6.912000e+02 0.000000e+00 0.000000e+00
75% 1.200000e+02 1.756193e+02 3.084213e+04 0.000000e+00 0.000000e+00
max 3.337143e+03 4.731522e+03 2.240000e+07 1.000000e+00 1.000000e+00
std 3.056611e+02 6.321992e+02 1.648703e+06 1.845467e-01 2.104011e-01
rst_flag_ct psh_flag_ct ack_flag_ct urg_flag_ct cwe_flag_ct \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 2.427305e-04 2.984626e-01 3.155040e-01 9.488638e-02 1.114579e-04
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
75% 0.000000e+00 1.000000e+00 1.000000e+00 0.000000e+00 0.000000e+00
max 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
std 1.557792e-02 4.575836e-01 4.647164e-01 2.930580e-01 1.055677e-02
ece_flag_ct down_up_ratio avg_pkt_sz avg_fwd_seg_sz \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 2.437920e-04 6.842268e-01 1.922860e+02 5.829124e+01
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 7.750000e+00 6.000000e+00
50% 0.000000e+00 1.000000e+00 7.250000e+01 3.400000e+01
75% 0.000000e+00 1.000000e+00 1.495000e+02 5.000000e+01
max 1.000000e+00 1.560000e+02 3.893333e+03 5.940857e+03
std 1.561194e-02 6.805770e-01 3.320397e+02 1.862237e+02
avg_bwd_seg_sz fwd_hdr_len_1 fwd_avg_bytes_bulk fwd_avg_pkts_bulk \
count 2.826180e+06 2.826180e+06 2826180.0 2826180.0
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 3.064413e+02 -2.603941e+04 0.0 0.0
min 0.000000e+00 -3.221223e+10 0.0 0.0
25% 2.000000e+00 4.000000e+01 0.0 0.0
50% 7.200000e+01 6.400000e+01 0.0 0.0
75% 1.813333e+02 1.200000e+02 0.0 0.0
max 5.800500e+03 4.644908e+06 0.0 0.0
std 6.056207e+02 2.106985e+07 0.0 0.0
fwd_avg_bulk_rate bwd_avg_bytes_bulk bwd_avg_pkts_bulk \
count 2826180.0 2826180.0 2826180.0
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.0 0.0 0.0
min 0.0 0.0 0.0
25% 0.0 0.0 0.0
50% 0.0 0.0 0.0
75% 0.0 0.0 0.0
max 0.0 0.0 0.0
std 0.0 0.0 0.0
bwd_avg_bulk_rate subflow_fwd_pkts subflow_fwd_bytes \
count 2826180.0 2.826180e+06 2.826180e+06
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 0.0 9.311210e+00 5.501716e+02
min 0.0 1.000000e+00 0.000000e+00
25% 0.0 2.000000e+00 1.200000e+01
50% 0.0 2.000000e+00 6.200000e+01
75% 0.0 5.000000e+00 1.880000e+02
max 0.0 2.197590e+05 1.287034e+07
std 0.0 7.502717e+02 9.988100e+03
subflow_bwd_pkts subflow_bwd_bytes init_win_bytes_fwd \
count 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 1.041004e+01 1.618840e+04 6.996586e+03
min 0.000000e+00 0.000000e+00 -1.000000e+00
25% 1.000000e+00 4.000000e+00 -1.000000e+00
50% 2.000000e+00 1.240000e+02 2.510000e+02
75% 4.000000e+00 4.840000e+02 8.192000e+03
max 2.919220e+05 6.554530e+08 6.553500e+04
std 9.981930e+02 2.264883e+06 1.434350e+04
init_win_bytes_bwd act_data_pkt_fwd min_seg_sz_fwd active_mean \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN
top NaN NaN NaN NaN
freq NaN NaN NaN NaN
mean 1.989484e+03 5.426774e+00 -2.746141e+03 7.918279e+04
min -1.000000e+00 0.000000e+00 -5.368707e+08 0.000000e+00
25% -1.000000e+00 0.000000e+00 2.000000e+01 0.000000e+00
50% -1.000000e+00 1.000000e+00 2.400000e+01 0.000000e+00
75% 2.350000e+02 2.000000e+00 3.200000e+01 0.000000e+00
max 6.553500e+04 2.135570e+05 1.380000e+02 1.100000e+08
std 8.456933e+03 6.369392e+02 1.085865e+06 6.349001e+05
active_std active_max active_min idle_mean idle_std \
count 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN NaN NaN
top NaN NaN NaN NaN NaN
freq NaN NaN NaN NaN NaN
mean 3.867159e+04 1.466043e+05 5.820398e+04 8.323819e+06 5.028549e+05
min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
75% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
max 7.420000e+07 1.100000e+08 1.100000e+08 1.200000e+08 7.690000e+07
std 3.688951e+05 9.599700e+05 5.767597e+05 2.364610e+07 4.605474e+06
idle_max idle_min label \
count 2.826180e+06 2.826180e+06 2826180
unique NaN NaN 15
top NaN NaN BENIGN
freq NaN NaN 2269630
mean 8.701092e+06 7.928897e+06 NaN
min 0.000000e+00 0.000000e+00 NaN
25% 0.000000e+00 0.000000e+00 NaN
50% 0.000000e+00 0.000000e+00 NaN
75% 0.000000e+00 0.000000e+00 NaN
max 1.200000e+08 1.200000e+08 NaN
std 2.438251e+07 2.337941e+07 NaN
fn Mon Thu \
count 2826180 2.826180e+06 2.826180e+06
unique 8 NaN NaN
top Wednesday-workingHours.pcap_ISCX.csv NaN NaN
freq 691076 NaN NaN
mean NaN 1.872365e-01 1.621270e-01
min NaN 0.000000e+00 0.000000e+00
25% NaN 0.000000e+00 0.000000e+00
50% NaN 0.000000e+00 0.000000e+00
75% NaN 0.000000e+00 0.000000e+00
max NaN 1.000000e+00 1.000000e+00
std NaN 3.901013e-01 3.685673e-01
Tue Wed udp
count 2.826180e+06 2.826180e+06 2.826180e+06
unique NaN NaN NaN
top NaN NaN NaN
freq NaN NaN NaN
mean 1.575728e-01 2.445265e-01 3.536491e-01
min 0.000000e+00 0.000000e+00 0.000000e+00
25% 0.000000e+00 0.000000e+00 0.000000e+00
50% 0.000000e+00 0.000000e+00 0.000000e+00
75% 0.000000e+00 0.000000e+00 1.000000e+00
max 1.000000e+00 1.000000e+00 1.000000e+00
std 3.643400e-01 4.298062e-01 4.781020e-01
# Count the number of flows carrying each label
label_sizes = df2.groupby(['label']).size()
dfgrp = pd.DataFrame(label_sizes)
dfgrp.columns = ['count']
# Show the table with the most frequent labels first
dfgrp = dfgrp.sort_values(by=['count'], ascending=False)
display(dfgrp)
# Flip to ascending order for the chart (presumably so the largest class is
# drawn at the top of the horizontal bars)
dfgrp = dfgrp.sort_values(by=['count'], ascending=True)
# Horizontal bar chart of the label counts with a log-scaled count axis
fig1 = px.bar(
    dfgrp,
    x='count',
    y=dfgrp.index,
    orientation='h',
    log_x=True,
    text='count',
    template='plotly_white',
)
fig1.update_traces(marker_color='black')
fig1.update_xaxes(title="Flow count (log scale)")
fig1.update_yaxes(title="Label")
fig1.update_layout(title={'text': 'Network Flow Labels', 'xanchor': 'left', 'yanchor': 'top'})
fig1.show()
| count | |
|---|---|
| label | |
| BENIGN | 2269630 |
| DoS Hulk | 230124 |
| PortScan | 158798 |
| DDoS | 128025 |
| DoS GoldenEye | 10293 |
| FTP-Patator | 7935 |
| SSH-Patator | 5897 |
| DoS slowloris | 5796 |
| DoS Slowhttptest | 5499 |
| Bot | 1956 |
| Web Attack - Brute Force | 1507 |
| Web Attack - XSS | 652 |
| Infiltration | 36 |
| Web Attack - Sql Injection | 21 |
| Heartbleed | 11 |
# Principal component analysis
# Create the PCA instance
pca = PCA()
# Split the columns of df2 into quantitative fields (int64 / float64 / uint8),
# which feed the PCA, and all remaining fields, which are set aside.
# The (column, dtype) pairs are iterated directly: indexing df2.dtypes with a
# bare integer treats the key as a position on a label-indexed Series, which
# pandas has deprecated.
numFields = []
objFields = []
for col, dtype in df2.dtypes.items():
    if dtype in ('int64', 'float64', 'uint8'):
        numFields.append(col)
    else:
        objFields.append(col)
dfnum = df2[numFields]
dfobj = df2[objFields]
print("Numeric fields:", dfnum.shape[1])
print("Non-numeric fields:", dfobj.shape[1])
# PCA
# NOTE(review): no feature scaling is visible before this fit, so columns with
# large magnitudes will presumably dominate the components - confirm this is
# intended.
pca.fit(dfnum)
Numeric fields: 84 Non-numeric fields: 6
PCA()
# PCA summary: component vectors, variance shares, and the projected data
print('Components:')
print(pca.components_)
print()
# For every component print its index, its fraction of the summed explained
# variance, and the running total of those fractions
print('Explained variance %:')
variance_total = pca.explained_variance_.sum()
cumsum = 0
for i, variance in enumerate(pca.explained_variance_):
    curr = variance / variance_total
    cumsum += curr
    print(i, round(curr, 5), round(cumsum, 5))
print()
# Project the numeric fields onto the components, carrying over dfnum's index
dfnum2 = pd.DataFrame(pca.transform(dfnum), index=dfnum.index)
print('Transformed data:')
print(dfnum2)
print()
Components:
[[ 2.61696214e-05 -3.43866949e-05 4.35609305e-01 ... -3.58671939e-10
1.66598117e-09 -1.50112192e-09]
[ 9.12254632e-07 -2.06741368e-05 3.08128729e-01 ... 8.57131617e-10
-2.55968392e-09 -1.17803428e-09]
[-1.07530591e-06 -2.07959795e-07 -5.99300446e-04 ... 2.87217097e-11
-2.80834352e-12 2.96412705e-11]
...
[-0.00000000e+00 2.95293897e-19 2.38783378e-20 ... -2.43017458e-17
1.29172227e-17 1.23155864e-16]
[-0.00000000e+00 -2.45086785e-18 -1.27634737e-20 ... 7.29672922e-17
-3.21689481e-17 3.21218716e-17]
[-0.00000000e+00 5.70672085e-18 4.49849906e-20 ... 1.68401067e-16
-1.73818556e-16 -9.43754316e-17]]
Explained variance %:
0 0.62969 0.62969
1 0.10835 0.73805
2 0.10552 0.84357
3 0.07992 0.92349
4 0.04028 0.96377
5 0.02 0.98377
6 0.00747 0.99124
7 0.00337 0.99461
8 0.00122 0.99583
9 0.00119 0.99702
10 0.00086 0.99788
11 0.00051 0.99839
12 0.00033 0.99872
13 0.00032 0.99904
14 0.00026 0.9993
15 0.00024 0.99953
16 0.00017 0.9997
17 0.0001 0.9998
18 5e-05 0.99986
19 3e-05 0.99989
20 3e-05 0.99992
21 2e-05 0.99994
22 2e-05 0.99996
23 1e-05 0.99998
24 1e-05 0.99999
25 0.0 0.99999
26 0.0 1.0
27 0.0 1.0
28 0.0 1.0
29 0.0 1.0
30 0.0 1.0
31 0.0 1.0
32 0.0 1.0
33 0.0 1.0
34 0.0 1.0
35 0.0 1.0
36 0.0 1.0
37 0.0 1.0
38 0.0 1.0
39 0.0 1.0
40 0.0 1.0
41 0.0 1.0
42 0.0 1.0
43 0.0 1.0
44 0.0 1.0
45 0.0 1.0
46 0.0 1.0
47 0.0 1.0
48 0.0 1.0
49 0.0 1.0
50 0.0 1.0
51 0.0 1.0
52 0.0 1.0
53 0.0 1.0
54 0.0 1.0
55 0.0 1.0
56 0.0 1.0
57 0.0 1.0
58 0.0 1.0
59 0.0 1.0
60 0.0 1.0
61 0.0 1.0
62 0.0 1.0
63 0.0 1.0
64 0.0 1.0
65 0.0 1.0
66 0.0 1.0
67 0.0 1.0
68 0.0 1.0
69 0.0 1.0
70 0.0 1.0
71 0.0 1.0
72 0.0 1.0
73 0.0 1.0
74 0.0 1.0
75 0.0 1.0
76 0.0 1.0
77 0.0 1.0
78 0.0 1.0
79 0.0 1.0
80 0.0 1.0
81 0.0 1.0
82 0.0 1.0
83 0.0 1.0
Transformed data:
0 1 2 3 4 \
0 -3.082614e+07 -2.600130e+06 -31026.723690 2.168173e+06 927177.757188
1 -3.078692e+07 -2.537748e+06 -30529.167812 -1.723815e+06 943089.495008
2 -3.078818e+07 -2.539681e+06 -30543.726412 -1.603095e+06 942606.651025
3 -3.078942e+07 -2.541639e+06 -30558.454647 -1.480881e+06 942108.359429
4 -3.082614e+07 -2.600130e+06 -31026.723689 2.168173e+06 927177.757146
... ... ... ... ... ...
692698 -3.073471e+07 -2.537672e+06 -30582.333894 -1.825405e+06 926524.017554
692699 -3.079927e+07 -2.558037e+06 -30709.461124 -4.576279e+05 937701.770607
692700 -3.079034e+07 -2.543207e+06 -30632.643529 -1.382680e+06 941683.455907
692701 -2.914012e+07 -2.549914e+06 -30085.526755 -1.821772e+06 352328.061513
692702 -3.065616e+07 -2.534094e+06 -30600.016040 -1.828623e+06 897980.716020
5 6 7 8 \
0 -597359.906508 -315107.685702 -167214.497643 63049.482900
1 -603999.375923 -317603.056859 -168169.331773 63528.812482
2 -603813.315899 -317547.371612 -168187.354982 63509.326422
3 -603607.969274 -317473.631010 -168167.692115 63490.050077
4 -597359.906348 -315107.685900 -167214.497813 63049.482760
... ... ... ... ...
692698 -590219.567875 -295042.606879 -158927.593426 55354.046847
692699 -601741.508113 -316490.922755 -167767.045162 63645.029743
692700 -603412.900222 -317369.286943 -168131.369947 63446.471296
692701 -176764.992258 390685.977544 89448.068108 -163857.045385
692702 -566956.131378 -263013.184391 -144774.180319 43886.783758
9 10 11 12 \
0 -183554.961448 -114726.927068 -38794.851954 -285609.648188
1 -183432.181049 -114536.291890 -38308.244617 -278969.294611
2 -183422.650826 -114548.530255 -38274.575283 -279169.690004
3 -183413.130047 -114547.999060 -38273.161663 -279330.556495
4 -183554.961042 -114726.927298 -38794.851977 -285609.648408
... ... ... ... ...
692698 -161378.662201 -101882.066641 -29780.261229 -276608.397763
692699 -182617.267876 -114360.529649 -37873.274867 -277324.263269
692700 -183318.987744 -114498.508108 -38200.707020 -278998.391055
692701 423978.632044 401907.969947 217889.427121 -189694.874687
692702 -131056.567836 -84302.985808 -20442.510268 -263918.513540
13 14 15 16 17 \
0 796.157289 268732.748862 24118.927713 -54104.679808 4068.387359
1 725.876417 257226.227439 21672.352036 -51268.178075 3620.751285
2 727.581612 257519.086399 21713.794187 -51332.778650 3640.966376
3 729.159011 257806.627685 21772.144880 -51397.308138 3652.392189
4 796.157320 268732.748668 24118.927435 -54104.679689 4068.386615
... ... ... ... ... ...
692698 692.997950 255443.071442 25623.603556 -49605.924898 2803.187440
692699 666.475813 252218.135863 19576.959448 -51119.558909 3491.806019
692700 721.050138 257346.188328 21675.090318 -51309.979657 3632.644709
692701 947.039894 258434.056360 76549.584032 -45856.250547 -8171.248630
692702 665.491873 253565.570015 33422.462944 -45875.956062 -925.684620
18 19 20 21 22 \
0 -36316.268157 33874.505726 -1254.149841 32486.382920 -22812.714268
1 -28766.887376 22256.100802 -887.372735 15216.138484 -8382.249218
2 -28959.106508 22530.790456 -896.098237 15622.345379 -8719.826041
3 -29141.659493 22807.398810 -904.852157 16025.544519 -9065.794771
4 -36316.267135 33874.506612 -1254.149815 32486.381909 -22812.715922
... ... ... ... ... ...
692698 -27544.634193 21803.387752 -849.693790 14340.437967 -7876.361425
692699 -28473.813526 21984.667992 -872.558148 14803.854427 -8299.888859
692700 -28939.759290 22532.155070 -905.632379 15635.301393 -8733.198561
692701 9891.740561 20095.609942 -678.967493 16177.751897 -2380.695424
692702 -29203.793363 19704.260841 -785.843396 9534.503352 467.908362
23 24 25 26 27 \
0 814400.110895 24202.821134 23.516128 -787.894790 -143.032405
1 -95274.003495 -9403.731249 58.371118 -5619.975954 -478.363784
2 -74072.335713 -8624.908826 57.388621 -5521.036313 -472.577684
3 -52706.018508 -7835.522873 56.286825 -5394.286004 -456.734676
4 814400.091766 24202.821267 23.516705 -787.893658 -143.029837
... ... ... ... ... ...
692698 -115417.912793 -10005.990042 28.611710 -5466.678039 -358.931197
692699 -106379.866528 -9779.369495 56.654406 -5414.507524 -397.587601
692700 -72591.718831 -8566.068969 56.672261 -5504.409397 -473.872726
692701 -111085.036691 -2534.159586 -230.644589 -13455.495533 154.990467
692702 -115151.011473 -8015.275587 -173.624982 -4892.413145 -670.356372
28 29 30 31 32 \
0 -60.375503 -26020.218741 700.261217 47653.217033 -703.894915
1 -335.700061 6541.366673 950.877370 60421.542259 -2066.141307
2 -332.140161 18335.089743 921.370001 59341.896558 -2106.227525
3 -335.361669 29849.718014 815.868216 52811.241337 -2733.418388
4 -60.382665 -26020.312220 700.249521 47652.121314 -705.001469
... ... ... ... ... ...
692698 -422.563009 -8906.228748 72.004663 -11657.866394 -7068.426678
692699 -347.231209 -537.757635 294.877444 9373.474620 -6378.039151
692700 -320.124030 9812.320896 953.803471 61210.438070 -933.753347
692701 -5978.237629 -8663.509161 23.709275 -11764.495762 -7460.298591
692702 1216.400717 -9218.175041 64.637849 -17113.967887 -7282.425115
33 34 35 36 37 \
0 -229.996290 -11395.685417 -5906.893636 -256.140904 446.891726
1 -274.243874 -12505.302465 -7964.232357 -262.833415 434.395426
2 -247.048637 -12320.544923 -7976.997489 -41.817121 439.158407
3 -268.014559 -5361.782446 -6596.133972 240.756479 322.218433
4 -229.822393 -11394.143653 -5906.496616 -256.128053 446.879175
... ... ... ... ... ...
692698 1316.658812 673.708732 -277.635964 -169.279829 -216.245497
692699 315.356659 16877.727664 -99.315168 17.116463 -43.931489
692700 -376.835881 -14923.080609 -8721.460593 -205.209606 464.902189
692701 1506.611033 299.265139 -517.025838 -155.399553 -144.147640
692702 1676.964973 -3390.196034 -331.782106 -182.393423 -160.919418
38 39 40 41 42 43 \
0 -2.054464 177.335554 -0.554643 -4.311132 -0.626252 -31.236908
1 -3.645981 321.567541 3.045058 34.225732 -1.010377 -48.365763
2 -3.510025 308.644483 2.813903 31.974331 -1.016327 -48.213846
3 -3.077930 268.877859 3.215172 37.195267 -0.726933 -42.428628
4 -2.054305 177.315780 -0.554654 -4.308859 -0.626267 -31.237934
... ... ... ... ... ... ...
692698 -1.061866 73.936985 0.773734 26.872176 -1.477868 4.440719
692699 0.223551 -24.215837 -1.676562 -100.715622 1.220990 39.327565
692700 -3.318409 292.690468 1.815615 22.875028 -1.397769 -44.807712
692701 -1.070183 45.374534 -2.449485 -18.209320 -2.220225 14.300152
692702 -0.434673 25.685410 -0.602453 -14.633451 -0.819497 20.853171
44 45 46 47 48 49 \
0 11.446364 -21.782041 -15.444887 3.685494 -6.401468 -0.602381
1 23.040345 -14.655485 -3.181313 -4.673664 2.405915 -1.402444
2 22.434304 -15.052943 -3.388271 -3.932894 1.918285 -1.463645
3 20.679716 -9.545487 0.777307 -5.705747 4.519644 -1.795126
4 11.445656 -21.780799 -15.442686 3.684524 -6.400082 -0.602283
... ... ... ... ... ... ...
692698 14.593138 3.056836 -7.391701 3.763212 6.274682 -1.098914
692699 42.487167 -34.021920 -67.146537 71.456352 -56.694651 0.913569
692700 16.231186 -12.224793 9.578812 2.366623 -3.123519 -0.380003
692701 38.126133 -7.828856 -40.671216 40.953041 -12.550321 1.153550
692702 8.263276 -19.328316 -37.004556 20.564391 -5.393155 -4.423319
50 51 52 53 54 55 56 \
0 -5.258306 1.983675 2.666146 -0.447386 -0.187091 -0.375653 -0.179373
1 -2.998404 0.010514 0.991102 -0.301459 -0.253833 0.222400 0.137695
2 -3.093179 0.095410 1.091081 -0.299904 -0.252417 0.205441 0.137584
3 -1.775515 -0.381835 0.514145 0.115201 -0.205133 0.208945 0.158195
4 -5.257807 1.983529 2.665911 -0.447314 -0.187073 -0.375652 -0.179373
... ... ... ... ... ... ... ...
692698 4.673736 0.599172 0.103542 -4.419347 0.170463 -0.758975 0.659599
692699 10.316147 -2.807342 -5.061199 -2.244891 0.118375 -0.002018 0.703739
692700 -2.485240 1.016058 -3.577248 0.756544 -0.354560 -1.082876 0.737875
692701 15.158686 0.488574 -2.389666 -7.027204 0.175825 -0.815775 0.680567
692702 -3.360916 2.978245 5.712059 -7.384871 0.187372 -0.874612 0.701451
57 58 59 60 61 62 63 \
0 -0.159337 -0.010789 -0.206134 -0.015029 -0.212392 0.556106 0.067315
1 -0.027452 0.029384 0.157025 0.211265 -0.354250 0.179832 -0.421925
2 -0.025103 0.031594 0.156732 0.205389 -0.352844 0.179481 -0.422733
3 0.028439 0.058633 0.227327 0.239393 -0.356003 0.161794 -0.432011
4 -0.159325 -0.010784 -0.206123 -0.015014 -0.212394 0.556101 0.067314
... ... ... ... ... ... ... ...
692698 -0.209265 -0.299245 -0.390379 -0.417076 -0.018690 -0.073456 -0.213373
692699 -0.228623 -0.341790 -0.311261 -0.022064 0.039511 -0.088341 0.021670
692700 -0.234943 -0.225085 -0.200570 0.013318 1.182761 -0.119136 -0.040145
692701 -0.232671 -0.307656 -0.347872 -0.257251 -0.044682 -0.079677 -0.181765
692702 -0.229887 -0.294077 -0.300024 -0.152890 -0.047224 -0.081942 -0.167239
64 65 66 67 68 69 \
0 0.057710 0.003527 -0.000749 0.001707 2.481156e-06 -1.925049e-09
1 -0.075928 -0.003605 -0.000262 -0.001179 4.941021e-06 -1.023797e-09
2 -0.074153 -0.002988 -0.000271 -0.001219 5.184989e-06 -1.012883e-09
3 -0.074120 -0.002369 -0.000124 -0.001387 5.282047e-06 -1.103412e-09
4 0.057711 0.003527 -0.000749 0.001707 2.481117e-06 -1.925055e-09
... ... ... ... ... ... ...
692698 0.071992 -0.009118 0.000196 -0.000255 2.529807e-06 -2.284447e-10
692699 0.023521 -0.000610 0.000052 0.000326 -8.856551e-07 -4.858633e-10
692700 -0.017637 0.000543 0.000821 -0.002551 -9.834296e-06 -3.998986e-10
692701 0.054363 -0.008075 -0.000769 -0.000193 3.626718e-06 -5.298656e-10
692702 0.043292 -0.007425 0.000204 -0.000238 2.877133e-06 -2.007525e-10
70 71 72 73 74 \
0 1.230194e-08 3.516241e-10 2.165487e-09 -1.829363e-10 -2.743268e-10
1 1.211693e-08 9.431437e-10 2.500739e-09 -1.573568e-10 -2.507369e-10
2 1.211880e-08 9.259485e-10 2.500205e-09 -1.574829e-10 -2.517139e-10
3 1.136397e-08 9.044622e-10 2.558699e-09 -1.635154e-10 -2.515165e-10
4 1.230176e-08 3.516243e-10 2.165501e-09 -1.829381e-10 -2.743263e-10
... ... ... ... ... ...
692698 1.010741e-08 8.962695e-10 2.650598e-09 -1.784907e-10 -2.431961e-10
692699 8.410760e-09 7.005964e-10 2.500253e-09 -1.545035e-10 -2.589572e-10
692700 1.228019e-08 8.877594e-10 2.420016e-09 -1.568342e-10 -2.531723e-10
692701 9.446566e-09 8.618241e-10 2.411556e-09 -1.909516e-10 -2.266164e-10
692702 1.041642e-08 8.763309e-10 2.559252e-09 -1.707907e-10 -2.435998e-10
75 76 77 78 79 \
0 -2.834910e-09 -8.015333e-10 1.137580e-10 2.717402e-13 -3.760759e-12
1 -3.075818e-09 -7.435362e-10 1.641667e-10 2.803643e-13 -3.860477e-12
2 -3.069963e-09 -7.452518e-10 1.630312e-10 2.710759e-13 -3.839352e-12
3 -3.037866e-09 -7.462920e-10 1.618906e-10 2.236751e-13 -3.742953e-12
4 -2.834903e-09 -8.015330e-10 1.137580e-10 2.717123e-13 -3.760692e-12
... ... ... ... ... ...
692698 -2.992568e-09 -7.376213e-10 1.645831e-10 6.514546e-14 -3.434730e-12
692699 -3.027277e-09 -7.573691e-10 1.475397e-10 6.614290e-14 -3.361693e-12
692700 -3.070139e-09 -7.502390e-10 1.602251e-10 3.144676e-13 -3.915861e-12
692701 -2.944808e-09 -7.028006e-10 1.403926e-10 4.995858e-14 -3.404274e-12
692702 -3.026402e-09 -7.362897e-10 1.637708e-10 6.496810e-14 -3.443207e-12
80 81 82 83
0 2.501400e-12 2.089881e-13 -1.689769e-12 3.109255e-13
1 2.285121e-12 2.180308e-13 -1.906283e-12 3.067186e-13
2 2.275299e-12 2.230018e-13 -1.904407e-12 3.030477e-13
3 2.267880e-12 2.261445e-13 -1.878574e-12 2.469024e-13
4 2.501375e-12 2.089786e-13 -1.689795e-12 3.109170e-13
... ... ... ... ...
692698 2.279951e-12 1.825018e-13 -1.776949e-12 1.020557e-14
692699 2.295071e-12 1.951663e-13 -1.750706e-12 3.750200e-14
692700 2.303671e-12 2.260045e-13 -1.877610e-12 3.274979e-13
692701 2.157672e-12 2.127035e-13 -1.705541e-12 3.548540e-14
692702 2.269897e-12 1.762781e-13 -1.771518e-12 1.428569e-14
[2826180 rows x 84 columns]
# KNN modeling prep
# KNN parameters to iterate through
knn_vals = [3, 6, 9, 12]
knn_weights = ['uniform', 'distance']
knn_leaf_sizes = [15, 20]
# Max PCA components to use (the first 5 explain ~96% of the variance)
max_feat = 5
# Scoring methods to compute for every model run; refit_param names the one
# that GridSearchCV uses to pick and refit the best model
scoring_method = ['f1_macro', 'f1_micro', 'f1_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted']
refit_param = 'roc_auc_ovr'
# Cross-validation folds
cv_folds = 5
# Take exactly the first max_feat PCA columns (the modeling loops never use
# more than that) and prefix every one of them with 'pca', which also turns
# the integer column labels into strings
pca_feats = dfnum2.iloc[:, 0:max_feat].add_prefix('pca')
# Append the PCA columns to the non-numerical dataframe
df3 = pd.concat([dfobj, pca_feats], axis=1)
# Get the dependent variable
y = df3['label']
# Drop the dependent variable ("label"), along with other variables that don't make sense to use in the model
x = df3.drop(['label', 'flow_id', 'srcip', 'dstip', 'timestamp', 'fn'], axis=1)
# Split into training and test sets (stratified so every label keeps its share)
# NOTE(review): the PCA above was fitted on all of dfnum before this split, so
# the projection has already seen the test rows - confirm that is acceptable.
(x_train, x_test, y_train, y_test) = train_test_split(x, y, test_size=0.2, random_state=777, stratify=y)
# Standardize by subtracting the mean and scaling (i.e. dividing by stdev);
# the scaler is fitted on the training rows only and then applied to both sets
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# List the scorer names GridSearchCV accepts, and the installed sklearn version
# Import the package itself so that `sklearn.__version__` resolves in this cell
import sklearn
try:
    # Preferred accessor; the SCORERS dict is deprecated in newer releases
    from sklearn.metrics import get_scorer_names
    scorer_names = get_scorer_names()
except ImportError:
    # Fallback for releases that do not provide get_scorer_names
    from sklearn.metrics import SCORERS
    scorer_names = SCORERS.keys()
print(sorted(scorer_names))
print(sklearn.__version__)
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_brier_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_gamma_deviance', 'neg_mean_poisson_deviance', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'neg_root_mean_squared_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'rand_score', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'roc_auc_ovo', 'roc_auc_ovo_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted', 'top_k_accuracy', 'v_measure_score'] 1.0.2
# ---- TEST -------------------------------------------------------
# KNN modeling - this takes a long time
# Same modeling loop as the full run, driven by the reduced *_test settings
# ---- TEST -------------------------------------------------------
# Init results array
r_test = []
# Reduced settings for the test run
max_feat_test = 2
knn_vals_test = [3, 9]
knn_weights_test = ['distance']
knn_leaf_sizes_test = [15]
cv_folds_test = 2
# Metrics to compute; refit_param_test names the one used to refit the best model
scoring_method_test = ['f1_macro', 'f1_micro', 'f1_weighted', 'roc_auc_ovr', 'roc_auc_ovr_weighted']
refit_param_test = 'roc_auc_ovr'
# Iterate from 2 to max_feat_test PCA columns to see how using fewer features
# improves model speed while maintaining performance
for feat in range(2, max_feat_test + 1):
    # Start timer
    print("Modeling with", feat, "PCA columns")
    print()
    t1 = time.time()
    # Train on the first [feat] columns only
    x_train2 = x_train[:, :feat]
    # Parameters to vary using grid search
    # (from https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee)
    params = [{
        'n_neighbors': knn_vals_test,
        'weights': knn_weights_test,
        'leaf_size': knn_leaf_sizes_test,
    }]
    # Instantiate knn
    knn = KNeighborsClassifier()
    # Grid search using knn params
    gs_knn = GridSearchCV(
        knn,
        param_grid=params,
        scoring=scoring_method_test,
        cv=cv_folds_test,
        verbose=3,
        refit=refit_param_test,
    )
    gs_knn.fit(x_train2, y_train)
    print()
    # Predict malware class on test cases
    print("Predicting malware class on test cases")
    y_pred = gs_knn.predict(x_test[:, :feat])
    # Both reports are limited to the labels that occur in the predictions
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_pred))
    cr = classification_report(y_test, y_pred, output_dict=True, labels=np.unique(y_pred))
    print()
    # Collect the results for this feature count
    print("Scoring and appending results")
    best = gs_knn.best_params_
    result = {
        'feat': feat,
        'knn_k': best['n_neighbors'],
        'knn_weights': best['weights'],
        'knn_leaf_size': best['leaf_size'],
        # Scored on the training columns used for fitting
        'score': gs_knn.score(x_train2, y_train),
        'cm': cm.tolist(),
        'cr': cr,
        # Evaluated after 'score', so the elapsed time includes the scoring call
        't': (time.time() - t1),
        'cvr': gs_knn.cv_results_,
    }
    r_test.append(result)
    print()
print("Done fitting with KNN")
Modeling with 2 PCA columns Fitting 2 folds for each of 2 candidates, totalling 4 fits [CV 1/2] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.987) total time= 1.1min [CV 2/2] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.736) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.901) roc_auc_ovr_weighted: (test=0.987) total time= 1.1min [CV 1/2] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time= 1.1min [CV 2/2] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.993) total time= 1.2min Predicting malware class on test cases Scoring and appending results Done fitting with KNN
# ---- TEST -------------------------------------------------------
# Results: inspect the test-run output and save it to out.json
# ---- TEST -------------------------------------------------------
def _to_jsonable(obj):
    """Convert NumPy arrays and scalars to plain Python values for json.dump.

    Raises TypeError for any other type so unexpected objects are not
    silently stringified.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Print the cross-validation results of the first run
e = r_test[0]
print(e['cvr'])
# Print the type of every field of every result entry
for e in r_test:
    for k, v in e.items():
        print(k, type(v))
# Write the results as real JSON; str(r_test) would produce a Python repr
# (with array(...) fragments) that JSON parsers cannot read. The context
# manager closes the file even if serialization fails.
with open('out.json', 'w') as fh:
    json.dump(r_test, fh, default=_to_jsonable)
{'mean_fit_time': array([4.7936244 , 4.49001873]), 'std_fit_time': array([0.01810884, 0.11299288]), 'mean_score_time': array([60.80041277, 64.77530456]), 'std_score_time': array([1.17025292, 2.63501811]), 'param_leaf_size': masked_array(data=[15, 15],
mask=[False, False],
fill_value='?',
dtype=object), 'param_n_neighbors': masked_array(data=[3, 9],
mask=[False, False],
fill_value='?',
dtype=object), 'param_weights': masked_array(data=['distance', 'distance'],
mask=[False, False],
fill_value='?',
dtype=object), 'params': [{'leaf_size': 15, 'n_neighbors': 3, 'weights': 'distance'}, {'leaf_size': 15, 'n_neighbors': 9, 'weights': 'distance'}], 'split0_test_f1_macro': array([0.73207083, 0.74229839]), 'split1_test_f1_macro': array([0.73608658, 0.72889469]), 'mean_test_f1_macro': array([0.7340787 , 0.73559654]), 'std_test_f1_macro': array([0.00200788, 0.00670185]), 'rank_test_f1_macro': array([2, 1]), 'split0_test_f1_micro': array([0.98058156, 0.98109551]), 'split1_test_f1_micro': array([0.98066737, 0.98107162]), 'mean_test_f1_micro': array([0.98062446, 0.98108357]), 'std_test_f1_micro': array([4.29024337e-05, 1.19419145e-05]), 'rank_test_f1_micro': array([2, 1]), 'split0_test_f1_weighted': array([0.98060531, 0.9810909 ]), 'split1_test_f1_weighted': array([0.980695 , 0.9810743]), 'mean_test_f1_weighted': array([0.98065016, 0.9810826 ]), 'std_test_f1_weighted': array([4.48413417e-05, 8.30130612e-06]), 'rank_test_f1_weighted': array([2, 1]), 'split0_test_roc_auc_ovr': array([0.90764561, 0.93509814]), 'split1_test_roc_auc_ovr': array([0.90137343, 0.92600372]), 'mean_test_roc_auc_ovr': array([0.90450952, 0.93055093]), 'std_test_roc_auc_ovr': array([0.00313609, 0.00454721]), 'rank_test_roc_auc_ovr': array([2, 1]), 'split0_test_roc_auc_ovr_weighted': array([0.98703227, 0.99250174]), 'split1_test_roc_auc_ovr_weighted': array([0.98716332, 0.99254448]), 'mean_test_roc_auc_ovr_weighted': array([0.98709779, 0.99252311]), 'std_test_roc_auc_ovr_weighted': array([6.55263003e-05, 2.13718427e-05]), 'rank_test_roc_auc_ovr_weighted': array([2, 1])}
feat <class 'int'>
knn_k <class 'int'>
knn_weights <class 'str'>
knn_leaf_size <class 'int'>
score <class 'numpy.float64'>
cm <class 'list'>
cr <class 'dict'>
t <class 'float'>
cvr <class 'dict'>
# KNN modeling - this takes a long time
def _to_jsonable(obj):
    """Convert NumPy arrays and scalars to plain Python values for json.dump.

    Raises TypeError for any other type so unexpected objects are not
    silently stringified.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, np.generic):
        return obj.item()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Init results array
r = []
# Iterate from 2 to max_feat to see how using fewer features improves
# model speed while maintaining performance
for feat in range(2, max_feat + 1):
    # Start timer
    print("Modeling with", feat, "PCA columns")
    print()
    t1 = time.time()
    # Choose only the first [feat] columns to train with
    x_train2 = x_train[:, 0:feat]
    # Parameters to vary using grid search
    # (from https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee)
    params = [{
        'n_neighbors': knn_vals,
        'weights': knn_weights,
        'leaf_size': knn_leaf_sizes,
    }]
    # Instantiate knn
    knn = KNeighborsClassifier()
    # Grid search using knn params
    gs_knn = GridSearchCV(knn, param_grid=params, scoring=scoring_method, cv=cv_folds, verbose=3, refit=refit_param)
    gs_knn.fit(x_train2, y_train)
    print()
    # Predict malware class on test cases
    print("Predicting malware class on test cases")
    y_pred = gs_knn.predict(x_test[:, 0:feat])
    # NOTE(review): labels is restricted to the classes that appear in y_pred,
    # so a class the model never predicts is left out of both reports -
    # confirm this is intended.
    cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_pred))
    cr = classification_report(y_test, y_pred, output_dict=True, labels=np.unique(y_pred))
    print()
    # Append results to array
    print("Scoring and appending results")
    r.append(
        {
            'feat': feat,
            'knn_k': gs_knn.best_params_['n_neighbors'],
            'knn_weights': gs_knn.best_params_['weights'],
            'knn_leaf_size': gs_knn.best_params_['leaf_size'],
            # NOTE(review): this is scored on the training data, not on
            # x_test - confirm that is the number wanted here.
            'score': gs_knn.score(x_train2, y_train),
            'cm': cm.tolist(),
            'cr': cr,
            # Evaluated after 'score', so the elapsed time includes scoring
            't': (time.time() - t1),
            'cvr': gs_knn.cv_results_,
        }
    )
    print()
    # Rewrite out.json with everything collected so far, so finished
    # iterations survive an interrupted run. json.dump replaces str(r), whose
    # Python repr (with array(...) fragments) is not parseable as JSON; the
    # context manager closes the file even if serialization fails.
    with open('out.json', 'w') as fh:
        json.dump(r, fh, default=_to_jsonable)
print("Done fitting with KNN")
Modeling with 2 PCA columns Fitting 5 folds for each of 16 candidates, totalling 80 fits [CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time= 34.9s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.721) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time= 35.2s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.749) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time= 34.3s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.675) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time= 33.9s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.745) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time= 33.7s [CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.767) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time= 28.8s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.726) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time= 28.8s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.753) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time= 28.8s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time= 30.7s [CV 5/5] END 
leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time= 30.1s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.703) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time= 35.2s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.713) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time= 35.2s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time= 35.8s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.660) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time= 35.7s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time= 34.3s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.774) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time= 30.8s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time= 35.1s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time= 32.2s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: 
(test=0.983) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time= 31.2s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time= 31.6s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.707) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time= 38.5s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time= 37.1s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.651) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time= 35.9s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time= 37.0s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.724) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time= 36.6s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.772) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time= 36.1s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.727) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time= 34.8s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time= 32.3s [CV 4/5] END 
leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time= 30.4s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time= 31.6s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.646) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time= 37.1s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.645) f1_micro: (test=0.979) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 36.3s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.639) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time= 36.0s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.638) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time= 37.2s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.994) total time= 35.6s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.751) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time= 31.0s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 31.8s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) 
f1_weighted: (test=0.982) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time= 34.6s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.699) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time= 31.8s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time= 30.5s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time= 37.3s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.721) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time= 34.4s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.749) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time= 34.0s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.675) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time= 35.3s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.745) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time= 35.5s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.767) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.917) roc_auc_ovr_weighted: (test=0.988) total time= 29.1s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.726) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.988) total time= 
29.3s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.753) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.988) total time= 30.8s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.884) roc_auc_ovr_weighted: (test=0.988) total time= 29.0s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.920) roc_auc_ovr_weighted: (test=0.988) total time= 28.3s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.703) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time= 34.8s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.713) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time= 38.0s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time= 35.8s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.660) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time= 34.7s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time= 36.4s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.774) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.992) total time= 29.8s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: 
(test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.992) total time= 31.4s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.992) total time= 29.6s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.908) roc_auc_ovr_weighted: (test=0.992) total time= 30.3s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.992) total time= 30.5s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.707) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total time= 35.2s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time= 35.1s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.651) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time= 37.0s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time= 36.4s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.724) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time= 34.7s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.772) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.993) total 
time= 30.9s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.727) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.951) roc_auc_ovr_weighted: (test=0.993) total time= 31.0s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.993) total time= 30.3s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.701) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.993) total time= 30.2s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.993) total time= 30.6s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.646) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time= 37.3s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.645) f1_micro: (test=0.979) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 35.9s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.639) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time= 36.3s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.638) f1_micro: (test=0.980) f1_weighted: (test=0.979) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time= 38.0s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.994) total time= 35.8s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.751) 
f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.994) total time= 30.7s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 32.3s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.994) total time= 32.1s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.699) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.994) total time= 31.0s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time= 30.2s Predicting malware class on test cases Scoring and appending results Modeling with 3 PCA columns Fitting 5 folds for each of 16 candidates, totalling 80 fits [CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time= 37.2s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time= 38.5s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.752) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time= 37.6s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time= 36.9s [CV 5/5] END 
leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.747) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time= 37.8s [CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time= 32.5s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time= 31.7s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time= 31.7s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time= 32.7s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time= 31.5s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.705) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time= 37.9s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.716) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 39.5s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.711) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time= 39.2s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.661) f1_micro: (test=0.982) f1_weighted: 
(test=0.982) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time= 38.0s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time= 38.7s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.778) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time= 34.1s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 33.0s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.734) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 33.1s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time= 34.1s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time= 32.3s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time= 38.7s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.662) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 40.2s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time= 39.0s [CV 4/5] END 
leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.656) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time= 39.1s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time= 39.8s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.775) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time= 34.8s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.730) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 34.1s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time= 34.3s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time= 35.6s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time= 33.6s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.649) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.994) total time= 39.6s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.648) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.994) total time= 42.7s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) 
f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time= 40.5s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time= 39.9s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.643) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time= 41.4s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 34.9s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.995) total time= 34.8s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time= 36.3s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.702) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time= 35.9s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.984) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.995) total time= 34.6s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time= 37.7s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.723) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total 
time= 38.7s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.752) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time= 37.1s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time= 37.7s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.747) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time= 38.4s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.990) total time= 31.4s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.925) roc_auc_ovr_weighted: (test=0.989) total time= 31.6s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.990) total time= 33.1s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.729) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.885) roc_auc_ovr_weighted: (test=0.990) total time= 32.9s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.922) roc_auc_ovr_weighted: (test=0.989) total time= 31.1s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.705) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time= 38.3s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.716) f1_micro: 
(test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 38.7s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.711) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.993) total time= 38.0s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.661) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time= 38.4s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.725) f1_micro: (test=0.982) f1_weighted: (test=0.982) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time= 38.5s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.778) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.930) roc_auc_ovr_weighted: (test=0.993) total time= 32.5s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 33.1s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.734) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.993) total time= 34.0s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.909) roc_auc_ovr_weighted: (test=0.993) total time= 34.1s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.993) total time= 32.3s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.709) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) 
total time= 39.7s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.662) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 40.8s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.654) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time= 38.8s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.656) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time= 39.7s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.981) f1_weighted: (test=0.981) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time= 40.5s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.775) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.933) roc_auc_ovr_weighted: (test=0.994) total time= 34.0s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.730) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.952) roc_auc_ovr_weighted: (test=0.994) total time= 34.2s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.732) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.994) total time= 35.8s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.705) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.913) roc_auc_ovr_weighted: (test=0.994) total time= 34.6s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.994) total time= 33.4s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.649) 
f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.994) total time= 41.1s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.648) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.994) total time= 41.2s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time= 39.8s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.641) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time= 41.9s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.643) f1_micro: (test=0.980) f1_weighted: (test=0.980) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.994) total time= 40.4s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 34.5s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.953) roc_auc_ovr_weighted: (test=0.995) total time= 36.0s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.731) f1_micro: (test=0.984) f1_weighted: (test=0.984) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.995) total time= 36.1s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.702) f1_micro: (test=0.983) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.915) roc_auc_ovr_weighted: (test=0.995) total time= 34.8s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.743) f1_micro: (test=0.984) f1_weighted: (test=0.983) roc_auc_ovr: (test=0.945) 
roc_auc_ovr_weighted: (test=0.995) total time= 34.8s Predicting malware class on test cases Scoring and appending results Modeling with 4 PCA columns Fitting 5 folds for each of 16 candidates, totalling 80 fits [CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.729) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time= 40.0s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.736) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time= 39.3s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.763) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time= 38.6s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time= 39.3s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.759) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time= 39.0s [CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.782) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time= 33.2s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time= 33.5s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time= 34.9s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.990) 
f1_weighted: (test=0.990) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time= 35.0s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.760) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time= 33.0s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time= 41.0s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.730) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time= 41.6s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.995) total time= 39.7s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.674) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.995) total time= 41.3s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.739) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 40.7s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.788) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time= 34.6s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time= 35.7s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time= 35.2s 
[CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.717) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.996) total time= 34.9s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.759) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 34.4s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time= 41.8s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.672) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time= 41.2s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.665) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time= 41.3s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.668) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time= 42.0s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.715) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time= 40.8s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.786) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time= 36.0s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time= 37.0s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.989) 
f1_weighted: (test=0.989) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time= 35.9s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.716) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time= 35.9s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time= 36.9s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time= 42.1s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.658) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time= 42.2s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.653) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.996) total time= 44.1s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.655) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time= 42.4s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.657) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.996) total time= 42.1s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.764) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.997) total time= 38.6s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time= 
37.3s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.997) total time= 37.0s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.712) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time= 39.0s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.997) total time= 37.2s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.729) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time= 38.3s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.736) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time= 40.6s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.763) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time= 39.5s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time= 38.7s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.759) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time= 40.0s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.782) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.923) roc_auc_ovr_weighted: (test=0.994) total time= 34.3s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.744) f1_micro: 
(test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.928) roc_auc_ovr_weighted: (test=0.993) total time= 33.4s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.771) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.929) roc_auc_ovr_weighted: (test=0.994) total time= 34.0s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.889) roc_auc_ovr_weighted: (test=0.994) total time= 34.1s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.760) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.926) roc_auc_ovr_weighted: (test=0.994) total time= 33.0s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total time= 40.0s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.730) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time= 42.2s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.726) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.995) total time= 40.3s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.674) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.995) total time= 40.7s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.739) f1_micro: (test=0.988) f1_weighted: (test=0.988) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 41.6s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.788) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.995) total 
time= 35.0s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.995) total time= 34.9s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.747) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time= 36.5s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.717) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.912) roc_auc_ovr_weighted: (test=0.996) total time= 35.4s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.759) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.995) total time= 34.4s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.718) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time= 42.0s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.672) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time= 41.5s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.665) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time= 40.9s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.668) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time= 42.9s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.715) f1_micro: (test=0.987) f1_weighted: (test=0.987) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time= 41.5s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.786) f1_micro: 
(test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.996) total time= 35.8s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.742) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.954) roc_auc_ovr_weighted: (test=0.996) total time= 37.5s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.745) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.940) roc_auc_ovr_weighted: (test=0.996) total time= 36.9s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.716) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.916) roc_auc_ovr_weighted: (test=0.996) total time= 36.1s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.996) total time= 36.6s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.659) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.996) total time= 43.3s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.658) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time= 42.3s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.653) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.996) total time= 43.1s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.655) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time= 42.0s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.657) f1_micro: (test=0.986) f1_weighted: (test=0.986) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.996) 
total time= 41.7s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.764) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.937) roc_auc_ovr_weighted: (test=0.997) total time= 38.2s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.955) roc_auc_ovr_weighted: (test=0.996) total time= 38.8s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.744) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.942) roc_auc_ovr_weighted: (test=0.997) total time= 36.9s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.712) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.934) roc_auc_ovr_weighted: (test=0.997) total time= 38.7s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.989) f1_weighted: (test=0.989) roc_auc_ovr: (test=0.946) roc_auc_ovr_weighted: (test=0.997) total time= 36.9s Predicting malware class on test cases Scoring and appending results Modeling with 5 PCA columns Fitting 5 folds for each of 16 candidates, totalling 80 fits [CV 1/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.761) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time= 42.4s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.806) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 41.8s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 43.1s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.702) f1_micro: (test=0.992) f1_weighted: (test=0.992) 
roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time= 42.3s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=uniform; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time= 40.6s [CV 1/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.810) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time= 36.3s [CV 2/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.811) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 35.6s [CV 3/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.808) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 35.1s [CV 4/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time= 37.0s [CV 5/5] END leaf_size=15, n_neighbors=3, weights=distance; f1_macro: (test=0.806) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time= 36.2s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.722) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time= 42.0s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.748) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time= 44.2s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.731) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time= 43.1s [CV 4/5] END 
leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.687) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time= 42.5s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=uniform; f1_macro: (test=0.754) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time= 44.1s [CV 1/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.812) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time= 37.3s [CV 2/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time= 37.4s [CV 3/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.801) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time= 38.6s [CV 4/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time= 37.8s [CV 5/5] END leaf_size=15, n_neighbors=6, weights=distance; f1_macro: (test=0.790) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time= 36.8s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time= 45.5s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time= 44.6s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.682) f1_micro: (test=0.991) f1_weighted: 
(test=0.991) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.997) total time= 44.0s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.683) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time= 46.2s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=uniform; f1_macro: (test=0.733) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time= 43.2s [CV 1/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.820) f1_micro: (test=0.993) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time= 39.0s [CV 2/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time= 39.3s [CV 3/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.998) total time= 38.6s [CV 4/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time= 39.5s [CV 5/5] END leaf_size=15, n_neighbors=9, weights=distance; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time= 38.7s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time= 44.7s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.679) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time= 46.4s [CV 3/5] 
END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.671) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time= 46.1s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.673) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time= 45.0s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=uniform; f1_macro: (test=0.676) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time= 45.7s [CV 1/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.818) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time= 39.6s [CV 2/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time= 40.1s [CV 3/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time= 42.2s [CV 4/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.723) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time= 40.4s [CV 5/5] END leaf_size=15, n_neighbors=12, weights=distance; f1_macro: (test=0.770) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time= 39.8s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.761) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time= 42.8s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.806) f1_micro: (test=0.992) 
f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 41.1s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 40.9s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.702) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time= 42.9s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=uniform; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time= 40.5s [CV 1/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.810) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.924) roc_auc_ovr_weighted: (test=0.996) total time= 35.8s [CV 2/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.811) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 36.5s [CV 3/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.808) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.932) roc_auc_ovr_weighted: (test=0.996) total time= 35.5s [CV 4/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.741) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.905) roc_auc_ovr_weighted: (test=0.996) total time= 35.6s [CV 5/5] END leaf_size=20, n_neighbors=3, weights=distance; f1_macro: (test=0.806) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.996) total time= 36.6s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.722) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time= 43.0s 
[CV 2/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.748) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time= 42.5s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.731) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time= 45.0s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.687) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time= 42.9s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=uniform; f1_macro: (test=0.754) f1_micro: (test=0.992) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time= 42.3s [CV 1/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.812) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.936) roc_auc_ovr_weighted: (test=0.997) total time= 37.8s [CV 2/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.941) roc_auc_ovr_weighted: (test=0.997) total time= 37.3s [CV 3/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.801) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.997) total time= 37.0s [CV 4/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.728) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.914) roc_auc_ovr_weighted: (test=0.997) total time= 39.2s [CV 5/5] END leaf_size=20, n_neighbors=6, weights=distance; f1_macro: (test=0.790) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.957) roc_auc_ovr_weighted: (test=0.997) total time= 37.2s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.717) f1_micro: (test=0.991) 
f1_weighted: (test=0.991) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time= 43.6s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.690) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time= 46.0s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.682) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.997) total time= 44.1s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.683) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time= 44.7s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=uniform; f1_macro: (test=0.733) f1_micro: (test=0.991) f1_weighted: (test=0.991) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time= 44.8s [CV 1/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.820) f1_micro: (test=0.993) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.938) roc_auc_ovr_weighted: (test=0.997) total time= 38.5s [CV 2/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.755) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.945) roc_auc_ovr_weighted: (test=0.998) total time= 39.7s [CV 3/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.943) roc_auc_ovr_weighted: (test=0.998) total time= 40.1s [CV 4/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.725) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.918) roc_auc_ovr_weighted: (test=0.998) total time= 38.8s [CV 5/5] END leaf_size=20, n_neighbors=9, weights=distance; f1_macro: (test=0.791) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.960) roc_auc_ovr_weighted: (test=0.998) total time= 39.0s 
[CV 1/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.677) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time= 45.2s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.679) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time= 44.9s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.671) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time= 45.9s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.673) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time= 45.1s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=uniform; f1_macro: (test=0.676) f1_micro: (test=0.990) f1_weighted: (test=0.990) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time= 45.4s [CV 1/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.818) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.939) roc_auc_ovr_weighted: (test=0.998) total time= 42.9s [CV 2/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.757) f1_micro: (test=0.993) f1_weighted: (test=0.993) roc_auc_ovr: (test=0.958) roc_auc_ovr_weighted: (test=0.998) total time= 40.4s [CV 3/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.787) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.944) roc_auc_ovr_weighted: (test=0.998) total time= 40.3s [CV 4/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.723) f1_micro: (test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.935) roc_auc_ovr_weighted: (test=0.998) total time= 41.8s [CV 5/5] END leaf_size=20, n_neighbors=12, weights=distance; f1_macro: (test=0.770) f1_micro: 
(test=0.992) f1_weighted: (test=0.992) roc_auc_ovr: (test=0.962) roc_auc_ovr_weighted: (test=0.998) total time= 39.3s Predicting malware class on test cases Scoring and appending results Done fitting with KNN
# Inspect the first modelling run to see which result fields are worth tracking.
e = r[0]

# Headline fields of the run, printed in a fixed order as "<caption> <value>".
for caption, key in (
    ("PCA features:", 'feat'),
    ("knn_k:", 'knn_k'),
    ("knn_weights:", 'knn_weights'),
    ("knn_leaf_size:", 'knn_leaf_size'),
    ("runtime:", 't'),
    ("score:", 'score'),
):
    print(caption, e[key])
#print("cross-validation results:", e['cvr'])

# One line per entry of the classification report stored under 'cr'.
for name, metrics in e['cr'].items():
    print(f"{name}:", metrics)
# Confusion matrix (true values are in rows, predicted values in columns)
cm = np.array(e['cm'])
print('Confusion matrix:')
print(cm)
print()
# Correct values
cm_cor = cm * np.identity(cm.shape[0])
print('Correct values:')
print(cm_cor)
print()
# Incorrect values
cm_incor = cm - cm_cor
print('Incorrect values:')
print(cm_incor)
print()
# True positives
cm_tp = sum(cm_cor)
print('True positives:')
print(cm_tp)
print()
# False positives
cm_fp = np.sum(cm_incor, axis=0)
print('False positives:')
print(cm_fp)
print(np.sum(cm_fp))
print()
# False negatives
cm_fn = np.sum(cm_incor, axis=1)
print('False negatives:')
print(cm_fn)
print(np.sum(cm_fp))
print()
# True negatives
cm_tn = np.zeros(cm.shape[0])
for i in range(0, cm.shape[0]):
# Zero out the ith row and ith column
cm_tmp = cm.copy()
cm_tmp[i,:] = 0
cm_tmp[:,i] = 0
cm_tn[i] = np.sum(cm_tmp)
print('True negatives:')
print(cm_tn)
print(np.sum(cm_tn))
print()
PCA features: 2
knn_k: 12
knn_weights: distance
knn_leaf_size: 15
runtime: 2760.9946868419647
score: 1.0
BENIGN: {'precision': 0.9917280568329077, 'recall': 0.9896547014271049, 'f1-score': 0.9906902943313672, 'support': 453926}
Bot: {'precision': 0.7622641509433963, 'recall': 0.5166240409207161, 'f1-score': 0.6158536585365854, 'support': 391}
DDoS: {'precision': 0.9000755001887505, 'recall': 0.9311853153680921, 'f1-score': 0.9153661579806124, 'support': 25605}
DoS GoldenEye: {'precision': 0.8952520802741067, 'recall': 0.8882952889752307, 'f1-score': 0.8917601170160898, 'support': 2059}
DoS Hulk: {'precision': 0.9583306204831044, 'recall': 0.9594133623030962, 'f1-score': 0.9588716857397234, 'support': 46025}
DoS Slowhttptest: {'precision': 0.9155435759209344, 'recall': 0.9263636363636364, 'f1-score': 0.920921825576141, 'support': 1100}
DoS slowloris: {'precision': 0.9645328719723183, 'recall': 0.9620362381363244, 'f1-score': 0.9632829373650107, 'support': 1159}
FTP-Patator: {'precision': 0.9467005076142132, 'recall': 0.9401386263390044, 'f1-score': 0.943408156813152, 'support': 1587}
Heartbleed: {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2}
PortScan: {'precision': 0.9918775676608023, 'recall': 0.9958438287153653, 'f1-score': 0.9938567410875612, 'support': 31760}
SSH-Patator: {'precision': 0.9237560192616372, 'recall': 0.9754237288135593, 'f1-score': 0.9488870568837592, 'support': 1180}
Web Attack - Brute Force: {'precision': 0.6740506329113924, 'recall': 0.707641196013289, 'f1-score': 0.6904376012965965, 'support': 301}
Web Attack - XSS: {'precision': 0.3977272727272727, 'recall': 0.2692307692307692, 'f1-score': 0.3211009174311927, 'support': 130}
micro avg: {'precision': 0.9835113120891097, 'recall': 0.9835304524746782, 'f1-score': 0.9835208821887708, 'support': 565225}
macro avg: {'precision': 0.8709106812916028, 'recall': 0.8509115948158608, 'f1-score': 0.8580336269275223, 'support': 565225}
weighted avg: {'precision': 0.9835767736876729, 'recall': 0.9835304524746782, 'f1-score': 0.983519526836341, 'support': 565225}
Confusion matrix:
[[449230 61 2284 186 1615 91 36 83 0 246
79 13 2]
[ 177 202 11 0 1 0 0 0 0 0
0 0 0]
[ 1479 2 23843 12 267 0 0 0 0 2
0 0 0]
[ 200 0 12 1829 16 0 0 1 0 0
0 1 0]
[ 1508 0 334 15 44157 0 2 0 0 9
0 0 0]
[ 77 0 1 0 0 1019 3 0 0 0
0 0 0]
[ 35 0 1 0 1 3 1115 0 0 2
1 1 0]
[ 94 0 0 0 0 0 0 1492 0 0
0 1 0]
[ 0 0 0 0 0 0 0 0 2 0
0 0 0]
[ 109 0 4 1 17 0 0 0 0 31628
0 0 1]
[ 26 0 0 0 3 0 0 0 0 0
1151 0 0]
[ 24 0 0 0 0 0 0 0 0 0
14 213 50]
[ 7 0 0 0 0 0 0 0 0 0
1 87 35]]
Correct values:
[[4.4923e+05 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 2.0200e+02 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 2.3843e+04 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 1.8290e+03 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 4.4157e+04 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.0190e+03
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
1.1150e+03 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 1.4920e+03 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 2.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 3.1628e+04 0.0000e+00 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.1510e+03 0.0000e+00
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 2.1300e+02
0.0000e+00]
[0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
3.5000e+01]]
Incorrect values:
[[0.000e+00 6.100e+01 2.284e+03 1.860e+02 1.615e+03 9.100e+01 3.600e+01
8.300e+01 0.000e+00 2.460e+02 7.900e+01 1.300e+01 2.000e+00]
[1.770e+02 0.000e+00 1.100e+01 0.000e+00 1.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
[1.479e+03 2.000e+00 0.000e+00 1.200e+01 2.670e+02 0.000e+00 0.000e+00
0.000e+00 0.000e+00 2.000e+00 0.000e+00 0.000e+00 0.000e+00]
[2.000e+02 0.000e+00 1.200e+01 0.000e+00 1.600e+01 0.000e+00 0.000e+00
1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00]
[1.508e+03 0.000e+00 3.340e+02 1.500e+01 0.000e+00 0.000e+00 2.000e+00
0.000e+00 0.000e+00 9.000e+00 0.000e+00 0.000e+00 0.000e+00]
[7.700e+01 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 3.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
[3.500e+01 0.000e+00 1.000e+00 0.000e+00 1.000e+00 3.000e+00 0.000e+00
0.000e+00 0.000e+00 2.000e+00 1.000e+00 1.000e+00 0.000e+00]
[9.400e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00]
[0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
[1.090e+02 0.000e+00 4.000e+00 1.000e+00 1.700e+01 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00]
[2.600e+01 0.000e+00 0.000e+00 0.000e+00 3.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
[2.400e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 1.400e+01 0.000e+00 5.000e+01]
[7.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
0.000e+00 0.000e+00 0.000e+00 1.000e+00 8.700e+01 0.000e+00]]
True positives:
[4.4923e+05 2.0200e+02 2.3843e+04 1.8290e+03 4.4157e+04 1.0190e+03
1.1150e+03 1.4920e+03 2.0000e+00 3.1628e+04 1.1510e+03 2.1300e+02
3.5000e+01]
False positives:
[3736. 63. 2647. 214. 1920. 94. 41. 84. 0. 259. 95. 103.
53.]
9309.0
False negatives:
[4696. 189. 1762. 230. 1868. 81. 44. 95. 0. 132. 29. 88.
95.]
9309.0
True negatives:
[107563. 564771. 536973. 562952. 517280. 564031. 564025. 563554. 565223.
533206. 563950. 564821. 565042.]
6773391.0
# Results
# Load scores.csv; row 0 is the header, so the scores of run N are in row N + 1.
# The context manager guarantees the file is closed even if parsing fails.
with open("scores.csv", "r", newline="") as fh:
    ascores = list(csv.reader(fh))
print(ascores)

for run, e in enumerate(r):
    row = ascores[run + 1]

    # Append scores (values stay strings, exactly as read from the CSV)
    e['f1_macro_score'] = row[2]
    e['f1_macro_leaf_size'] = row[3]
    e['f1_macro_neighbors'] = row[4]
    e['f1_macro_weights'] = row[5]
    e['f1_macro_point_label'] = (
        f"(k={e['f1_macro_neighbors']},wt={e['f1_macro_weights']}"
        f",sz={e['f1_macro_leaf_size']})"
    )
    e['roc_auc_ovr_score'] = row[14]
    e['roc_auc_ovr_leaf_size'] = row[15]
    e['roc_auc_ovr_neighbors'] = row[16]
    e['roc_auc_ovr_weights'] = row[17]
    e['roc_auc_ovr_point_label'] = (
        f"(k={e['roc_auc_ovr_neighbors']},wt={e['roc_auc_ovr_weights']}"
        f",sz={e['roc_auc_ovr_leaf_size']})"
    )

    # Calcs; need to do some calcs to get true positive rate and false positive rate.
    # Confusion matrix: true values are in rows, predicted values in columns.
    cm = np.array(e['cm'])
    # Correct values (diagonal only) and incorrect values (off-diagonal only)
    cm_cor = np.diag(np.diag(cm))
    cm_incor = cm - cm_cor
    # True positives per class
    cm_tp = np.diag(cm)
    cm_tp_sum = np.sum(cm_tp)
    # False positives per class (column sums of the errors)
    cm_fp = np.sum(cm_incor, axis=0)
    cm_fp_sum = np.sum(cm_fp)
    # False negatives per class (row sums of the errors)
    cm_fn = np.sum(cm_incor, axis=1)
    cm_fn_sum = np.sum(cm_fn)
    # True negatives per class: everything outside the class's row and column.
    # total - column_i - row_i removes cm[i, i] twice, so add the diagonal back.
    cm_tn = np.sum(cm) - np.sum(cm, axis=0) - np.sum(cm, axis=1) + cm_tp
    cm_tn_sum = np.sum(cm_tn)

    # Rates below are micro-averaged: counts are pooled over all classes first.
    # True positive rate = recall = sensitivity = TP / (TP + FN)
    e['tpr'] = np.round(cm_tp_sum / (cm_tp_sum + cm_fn_sum), 4)
    # False positive rate = FP / (FP + TN)
    e['fpr'] = np.round(cm_fp_sum / (cm_fp_sum + cm_tn_sum), 4)
    # True negative rate = specificity = TN / (TN + FP)
    e['tnr'] = np.round(cm_tn_sum / (cm_tn_sum + cm_fp_sum), 4)

    # Create point label for graph
    e['point_label'] = (
        f"(feat={e['feat']}, k={e['knn_k']}, leaf={e['knn_leaf_size']}"
        f", wt={e['knn_weights']})"
    )

    # Results
    print("Run #", run)
    print("\tPCA features:", e['feat'])
    print("\tf1_macro_score:", e['f1_macro_score'])
    print("\tf1_macro_neighbors:", e['f1_macro_neighbors'])
    print("\tf1_macro_weights:", e['f1_macro_weights'])
    print("\tf1_macro_leaf_size:", e['f1_macro_leaf_size'])
    print("\troc_auc_ovr_score:", e['roc_auc_ovr_score'])
    print("\troc_auc_ovr_neighbors:", e['roc_auc_ovr_neighbors'])
    print("\troc_auc_ovr_weights:", e['roc_auc_ovr_weights'])
    print("\troc_auc_ovr_leaf_size:", e['roc_auc_ovr_leaf_size'])
    print("\truntime (sec):", e['t'])
    print("\ttrue positive rate:", e['tpr'])
    print("\tfalse positive rate:", e['fpr'])
    print("\ttrue negative rate:", e['tnr'])
    print("\tG-Mean:", np.sqrt(e['tpr'] * e['tnr']))
    print()

# Leave the notebook-level counter in the state the original loop left it
# (number of runs processed), in case a later cell reads it.
ct = len(r)
[['run', 'pca_cols', 'f1_macro_score', 'f1_macro_leaf_size', 'f1_macro_n_neighbors', 'f1_macro_weights', 'f1_micro_score', 'f1_micro_leaf_size', 'f1_micro_n_neighbors', 'f1_micro_weights', 'f1_weighted_score', 'f1_weighted_leaf_size', 'f1_weighted_n_neighbors', 'f1_weighted_weights', 'roc_auc_ovr_score', 'roc_auc_ovr_leaf_size', 'roc_auc_ovr_n_neighbors', 'roc_auc_ovr_weights', 'roc_auc_ovr_weighted_score', 'roc_auc_ovr_weighted_leaf_size', 'roc_auc_ovr_weighted_n_neighbors', 'roc_auc_ovr_weighted_weights'], ['0', '2', '0.774', '15', '6', 'distance', '0.983', '15', '6', 'distance', '0.983', '15', '6', 'distance', '0.952', '15', '12', 'distance', '0.994', '15', '12', 'distance'], ['1', '3', '0.778', '15', '12', 'distance', '0.984', '15', '3', 'distance', '0.984', '15', '3', 'distance', '0.953', '15', '12', 'distance', '0.995', '15', '12', 'distance'], ['2', '4', '0.788', '15', '6', 'distance', '0.99', '15', '3', 'distance', '0.99', '15', '3', 'distance', '0.955', '15', '12', 'distance', '0.997', '15', '12', 'distance'], ['3', '5', '0.82', '15', '9', 'distance', '0.993', '15', '3', 'distance', '0.993', '15', '3', 'distance', '0.962', '15', '12', 'distance', '0.998', '15', '9', 'distance']] Run # 0 PCA features: 2 f1_macro_score: 0.774 f1_macro_neighbors: 6 f1_macro_weights: distance f1_macro_leaf_size: 15 roc_auc_ovr_score: 0.952 roc_auc_ovr_neighbors: 12 roc_auc_ovr_weights: distance roc_auc_ovr_leaf_size: 15 runtime (sec): 2760.9946868419647 true positive rate: 0.9835 false positive rate: 0.0014 true negative rate: 0.9986 G-Mean: 0.9910212409428973 Run # 1 PCA features: 3 f1_macro_score: 0.778 f1_macro_neighbors: 12 f1_macro_weights: distance f1_macro_leaf_size: 15 roc_auc_ovr_score: 0.953 roc_auc_ovr_neighbors: 12 roc_auc_ovr_weights: distance roc_auc_ovr_leaf_size: 15 runtime (sec): 3016.4819836616516 true positive rate: 0.9844 false positive rate: 0.0013 true negative rate: 0.9987 G-Mean: 0.9915242205816256 Run # 2 PCA features: 4 f1_macro_score: 0.788 
f1_macro_neighbors: 6 f1_macro_weights: distance f1_macro_leaf_size: 15 roc_auc_ovr_score: 0.955 roc_auc_ovr_neighbors: 12 roc_auc_ovr_weights: distance roc_auc_ovr_leaf_size: 15 runtime (sec): 3179.8600294589996 true positive rate: 0.9898 false positive rate: 0.0009 true negative rate: 0.9991 G-Mean: 0.9944391283532643 Run # 3 PCA features: 5 f1_macro_score: 0.82 f1_macro_neighbors: 9 f1_macro_weights: distance f1_macro_leaf_size: 15 roc_auc_ovr_score: 0.962 roc_auc_ovr_neighbors: 12 roc_auc_ovr_weights: distance roc_auc_ovr_leaf_size: 15 runtime (sec): 3396.462958574295 true positive rate: 0.9929 false positive rate: 0.0006 true negative rate: 0.9994 G-Mean: 0.9961446983244954
# Render plotly figures inline in the notebook
pio.renderers.default = 'notebook'

# Load the per-method scores, label each point with its k, and keep only the
# two scoring methods we want to plot
df_scores = pd.read_csv('scores2.csv')
df_scores['point_label'] = 'k=' + df_scores['n_neighbors'].astype(str)
df_scores = df_scores[df_scores['method'].isin(['f1_macro', 'roc_auc_ovr'])]
display(df_scores)

# Results array as a data frame
df_result = pd.DataFrame(r)

# Score vs. number of features, one facet per scoring method
fig1 = px.scatter(
    df_scores,
    x='feat',
    y='score',
    text='point_label',
    facet_col='method',
    template='plotly_white',
    facet_col_spacing=0.1,
)
(
    fig1.update_traces(textposition='top center')
    .update_layout(xaxis_range=[1, 6], yaxis_range=[0.75, 1.0])
    .update_xaxes(title="Number of features")
    .update_yaxes(title="Score")
)
fig1.show()

# Load the per-run scores and build a "<minutes> min (f1=<score>)" label
df_scores1 = pd.read_csv('scores.csv')
df_scores1['point_label'] = (
    df_scores1['time_min'].astype(str)
    + ' min (f1='
    + df_scores1['f1_macro_score'].astype(str)
    + ')'
)
# Drop every column except the ones needed for the runtime plot
runtime_cols = ['run', 'feat', 'f1_macro_score', 'time_min', 'point_label']
df_scores1 = df_scores1.drop(columns=df_scores1.columns.difference(runtime_cols))
display(df_scores1)

# Runtime vs. number of features
fig2 = px.scatter(
    df_scores1,
    x='feat',
    y='time_min',
    text='point_label',
    template='plotly_white',
)
(
    fig2.update_traces(textposition='top center')
    .update_layout(xaxis_range=[1, 6], yaxis_range=[40, 60])
    .update_xaxes(title="Number of features")
    .update_yaxes(title="Runtime (min)")
)
fig2.show()
| run | feat | method | score | leaf_size | n_neighbors | weights | point_label | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2 | f1_macro | 0.774 | 15 | 6 | distance | k=6 |
| 3 | 0 | 2 | roc_auc_ovr | 0.952 | 15 | 12 | distance | k=12 |
| 5 | 1 | 3 | f1_macro | 0.778 | 15 | 12 | distance | k=12 |
| 8 | 1 | 3 | roc_auc_ovr | 0.953 | 15 | 12 | distance | k=12 |
| 10 | 2 | 4 | f1_macro | 0.788 | 15 | 6 | distance | k=6 |
| 13 | 2 | 4 | roc_auc_ovr | 0.955 | 15 | 12 | distance | k=12 |
| 15 | 3 | 5 | f1_macro | 0.820 | 15 | 9 | distance | k=9 |
| 18 | 3 | 5 | roc_auc_ovr | 0.962 | 15 | 12 | distance | k=12 |
| run | feat | f1_macro_score | time_min | point_label | |
|---|---|---|---|---|---|
| 0 | 0 | 2 | 0.774 | 46 | 46 min (f1=0.774) |
| 1 | 1 | 3 | 0.778 | 50 | 50 min (f1=0.778) |
| 2 | 2 | 4 | 0.788 | 53 | 53 min (f1=0.788) |
| 3 | 3 | 5 | 0.820 | 57 | 57 min (f1=0.82) |
# Confusion matrix for best model (run index 3); true classes are in rows,
# predicted classes in columns.
cmarray = np.array(r[3]['cm'])
# The classification report lists the per-class entries first, followed by the
# averaged entries, so the first n keys are the class labels.
# NOTE(review): relies on report key order matching confusion-matrix order - confirm.
cm_class_names = list(r[3]['cr'].keys())[:cmarray.shape[0]]
cm = pd.DataFrame(r[3]['cm'])
cm.columns = cm_class_names
cm.index = cm_class_names
display(cm)

# Reduced confusion matrix (one vs rest) - consider any malware as the positive class.
# Index 0 of the full matrix is BENIGN; indices 1.. are the malware classes.
cmr = np.zeros((2, 2), dtype=int)
cmr[1, 1] = cmarray[0, 0]              #TN=actually benign and predicted to be benign
cmr[1, 0] = np.sum(cmarray[0, 1:])     #FP=actually benign but predicted to be malware
cmr[0, 1] = np.sum(cmarray[1:, 0])     #FN=actually malware but predicted to be benign
# TP is only the malware-vs-malware sub-matrix; "total - TN" would also count
# the FP and FN cells and inflate the result.
cmr[0, 0] = np.sum(cmarray[1:, 1:])    #TP=actually malware and predicted to be malware
cmr = pd.DataFrame(cmr)
cmr.columns = ['Malware', 'Benign']
cmr.index = ['Malware', 'Benign']
display(cmr)
| BENIGN | Bot | DDoS | DoS GoldenEye | DoS Hulk | DoS Slowhttptest | DoS slowloris | FTP-Patator | Heartbleed | PortScan | SSH-Patator | Web Attack - Brute Force | Web Attack - XSS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BENIGN | 451913 | 45 | 519 | 67 | 926 | 74 | 31 | 44 | 0 | 240 | 60 | 4 | 3 |
| Bot | 127 | 257 | 3 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| DDoS | 121 | 0 | 25142 | 3 | 339 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| DoS GoldenEye | 82 | 0 | 0 | 1973 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 2 | 0 |
| DoS Hulk | 656 | 0 | 277 | 1 | 45084 | 0 | 1 | 0 | 0 | 6 | 0 | 0 | 0 |
| DoS Slowhttptest | 36 | 0 | 0 | 0 | 0 | 1058 | 4 | 0 | 0 | 0 | 0 | 0 | 2 |
| DoS slowloris | 26 | 0 | 0 | 0 | 2 | 5 | 1122 | 0 | 0 | 2 | 1 | 1 | 0 |
| FTP-Patator | 29 | 0 | 0 | 0 | 0 | 0 | 0 | 1556 | 0 | 0 | 1 | 1 | 0 |
| Heartbleed | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 |
| PortScan | 65 | 0 | 2 | 0 | 14 | 0 | 0 | 0 | 0 | 31678 | 0 | 0 | 1 |
| SSH-Patator | 23 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1156 | 0 | 0 |
| Web Attack - Brute Force | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | 212 | 52 |
| Web Attack - XSS | 6 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 86 | 36 |
| Malware | Benign | |
|---|---|---|
| Malware | 113312 | 1176 |
| Benign | 2013 | 451913 |
precision = $\frac{TP}{TP + FP}$ = 0.983 \ recall = $\frac{TP}{TP + FN}$ = 0.990 \ false positive rate = $\frac{FP}{FP + TN}$ = 0.00443
# Calculate highest and lowest tpr's, fpr's
# Per-malware rates measured against the BENIGN class only (row/column 0 of cm):
#   fn  = samples of the malware predicted as BENIGN
#   fp  = BENIGN samples predicted as the malware
#   tpr = correct / (correct + fn)
#   fpr = fp / (fp + correctly classified BENIGN)
rate_columns = ['malware', 'samples', 'fp', 'fn', 'tpr', 'fpr', 'tpr_text', 'fpr_text']
rate_rows = []
# Iterate over the non-BENIGN columns in cm
for i in range(1, cm.shape[1]):
    samples = np.sum(cm.iloc[i, :])
    fp = cm.iloc[0, i]
    fn = cm.iloc[i, 0]
    tpr = cm.iloc[i, i] / (cm.iloc[i, i] + fn)
    fpr = fp / (fp + cm.iloc[0, 0])
    rate_rows.append({
        'malware': cm.columns[i],
        'samples': samples,
        'fp': fp,
        'fn': fn,
        'tpr': tpr,
        'fpr': fpr,
        'tpr_text': f"{round(tpr * 100, 3)}% ({fn} FNs)",
        'fpr_text': f"{round(fpr * 100, 3)}% ({fp} FPs)",
    })
# Build the frame once instead of concatenating onto an empty frame per row;
# this keeps the numeric columns numeric and gives the same 0..n-1 index.
rates = pd.DataFrame(rate_rows, columns=rate_columns)
display(rates)
| malware | samples | fp | fn | tpr | fpr | tpr_text | fpr_text | |
|---|---|---|---|---|---|---|---|---|
| 0 | Bot | 391 | 45 | 127 | 0.669271 | 0.0001 | 66.927% (127 FNs) | 0.01% (45 FPs) |
| 1 | DDoS | 25605 | 519 | 121 | 0.99521 | 0.001147 | 99.521% (121 FNs) | 0.115% (519 FPs) |
| 2 | DoS GoldenEye | 2059 | 67 | 82 | 0.960097 | 0.000148 | 96.01% (82 FNs) | 0.015% (67 FPs) |
| 3 | DoS Hulk | 46025 | 926 | 656 | 0.985658 | 0.002045 | 98.566% (656 FNs) | 0.204% (926 FPs) |
| 4 | DoS Slowhttptest | 1100 | 74 | 36 | 0.967093 | 0.000164 | 96.709% (36 FNs) | 0.016% (74 FPs) |
| 5 | DoS slowloris | 1159 | 31 | 26 | 0.977352 | 0.000069 | 97.735% (26 FNs) | 0.007% (31 FPs) |
| 6 | FTP-Patator | 1587 | 44 | 29 | 0.981703 | 0.000097 | 98.17% (29 FNs) | 0.01% (44 FPs) |
| 7 | Heartbleed | 2 | 0 | 0 | 1.0 | 0.0 | 100.0% (0 FNs) | 0.0% (0 FPs) |
| 8 | PortScan | 31760 | 240 | 65 | 0.997952 | 0.000531 | 99.795% (65 FNs) | 0.053% (240 FPs) |
| 9 | SSH-Patator | 1180 | 60 | 23 | 0.980492 | 0.000133 | 98.049% (23 FNs) | 0.013% (60 FPs) |
| 10 | Web Attack - Brute Force | 301 | 4 | 5 | 0.976959 | 0.000009 | 97.696% (5 FNs) | 0.001% (4 FPs) |
| 11 | Web Attack - XSS | 130 | 3 | 6 | 0.857143 | 0.000007 | 85.714% (6 FNs) | 0.001% (3 FPs) |
# True positive rate per malware class, as a horizontal bar chart
rates = rates.sort_values(by=['tpr'], ascending=False)
fig1 = px.bar(
    rates,
    y='malware',
    x='tpr',
    template='plotly_white',
    orientation='h',
    log_x=False,
    text='tpr_text',
)
(
    fig1.update_traces(marker_color='black')
    .update_xaxes(title="True positive rate")
    .update_yaxes(title="Malware")
)
fig1.show()

# False positive rate per malware class, as a horizontal bar chart
rates = rates.sort_values(by=['fpr'], ascending=True)
fig2 = px.bar(
    rates,
    y='malware',
    x='fpr',
    template='plotly_white',
    orientation='h',
    log_x=False,
    text='fpr_text',
)
(
    fig2.update_traces(marker_color='black')
    .update_xaxes(title="False positive rate")
    .update_yaxes(title="Malware")
)
fig2.show()